library("PSCBS")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Simulating copy-number data
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
set.seed(0xBEEF)

# Number of loci
J <- 1000

x <- sort(runif(J, max=J)) * 1e5

mu <- double(J)
mu[200:300] <- mu[200:300] + 1
mu[350:400] <- NA # centromere
mu[650:800] <- mu[650:800] - 1
eps <- rnorm(J, sd=1/2)
y <- mu + eps

outliers <- seq(from=1L, to=J, length.out=0.2*J)
y[outliers] <- y[outliers] + 1.5

w <- rep(1.0, times=J)
w[outliers] <- 0.01

data <- data.frame(chromosome=1L, x=x, y=y)
dataW <- cbind(data, w=w)


par(mar=c(2,3,0.2,1)+0.1)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Single-chromosome segmentation
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Segment without weights
fit <- segmentByCBS(data)
sampleName(fit) <- "CBS_Example"
print(fit)
plotTracks(fit)
## Highlight outliers (they pull up the mean levels)
points(x[outliers]/1e6, y[outliers], col="purple")

# Segment with weights
fitW <- segmentByCBS(dataW)
sampleName(fitW) <- "CBS_Example (weighted)"
print(fitW)
drawLevels(fitW, col="red")

legend("topright", bg="white", legend=c("outliers", "non-weighted CBS", "weighted CBS"), col=c("purple", "purple", "red"), lwd=c(NA,3,3), pch=c(1,NA,NA))

## Assert that weighted segment means are less biased
dmean <- getSegments(fit)$mean - getSegments(fitW)$mean
cat("Segment mean differences:\n")
print(dmean)
stopifnot(all(dmean > 0, na.rm=TRUE))


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Segmentation with some known change points
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
knownSegments <- data.frame(
  chromosome=c(    1,   1),
  start     =x[c(  1, 401)],
  end       =x[c(349,   J)]
)
fit2 <- segmentByCBS(dataW, knownSegments=knownSegments, verbose=TRUE)
sampleName(fit2) <- "CBS_Example_2 (weighted)"
print(fit2)
plotTracks(fit2)
abline(v=c(knownSegments$start, knownSegments$end)/1e6, lty=3)


# Chromosome boundaries can be specified as -Inf and +Inf
knownSegments <- data.frame(
  chromosome=c(     1,      1),
  start     =c(  -Inf, x[401]),
  end       =c(x[349],   +Inf)
)
fit2b <- segmentByCBS(dataW, knownSegments=knownSegments, verbose=TRUE)
sampleName(fit2b) <- "CBS_Example_2b (weighted)"
print(fit2b)
plotTracks(fit2b)
abline(v=c(knownSegments$start, knownSegments$end)/1e6, lty=3)


# As a proof of concept, it is possible to segment just the centromere,
# which contains no data.  All statistics will be NAs.
knownSegments <- data.frame(
  chromosome=c(    1),
  start     =x[c(350)],
  end       =x[c(400)]
)
fit3 <- segmentByCBS(dataW, knownSegments=knownSegments, verbose=TRUE)
sampleName(fit3) <- "CBS_Example_3"
print(fit3)
plotTracks(fit3, Clim=c(0,5), xlim=c(0,100))
abline(v=c(knownSegments$start, knownSegments$end)/1e6, lty=3)


# If one specify the (empty) centromere as a segment, then its
# estimated statistics will be NAs, which becomes a natural
# separator between the two "independent" arms.
knownSegments <- data.frame(
  chromosome=c(    1,   1,   1),
  start     =x[c(  1, 350, 401)],
  end       =x[c(349, 400,   J)]
)
fit4 <- segmentByCBS(dataW, knownSegments=knownSegments, verbose=TRUE)
sampleName(fit4) <- "CBS_Example_4"
print(fit4)
plotTracks(fit4)
abline(v=c(knownSegments$start, knownSegments$end)/1e6, lty=3)


fit5 <- segmentByCBS(dataW, knownSegments=knownSegments, undo=Inf, verbose=TRUE)
sampleName(fit5) <- "CBS_Example_5"
print(fit5)
plotTracks(fit5)
abline(v=c(knownSegments$start, knownSegments$end)/1e6, lty=3)
stopifnot(nbrOfSegments(fit5) == nrow(knownSegments))


# One can also force a separator between two segments by setting
# 'start' and 'end' to NAs ('chromosome' has to be given)
knownSegments <- data.frame(
  chromosome=c(    1,  1,   1),
  start     =x[c(  1, NA, 401)],
  end       =x[c(349, NA,   J)]
)
fit6 <- segmentByCBS(dataW, knownSegments=knownSegments, verbose=TRUE)
sampleName(fit6) <- "CBS_Example_6"
print(fit6)
plotTracks(fit6)
abline(v=c(knownSegments$start, knownSegments$end)/1e6, lty=3)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Multi-chromosome segmentation
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
data2 <- data
data2$chromosome <- 2L
data <- rbind(data, data2)
dataW <- cbind(data, w=w)

par(mar=c(2,3,0.2,1)+0.1)
# Segment without weights
fit <- segmentByCBS(data)
sampleName(fit) <- "CBS_Example"
print(fit)
plotTracks(fit, Clim=c(-3,3))

# Segment with weights
fitW <- segmentByCBS(dataW)
sampleName(fitW) <- "CBS_Example (weighted)"
print(fitW)
drawLevels(fitW, col="red")

legend("topright", bg="white", legend=c("outliers", "non-weighted CBS", "weighted CBS"), col=c("purple", "purple", "red"), lwd=c(NA,3,3), pch=c(1,NA,NA))

## Assert that weighted segment means are less biased
dmean <- getSegments(fit)$mean - getSegments(fitW)$mean
cat("Segment mean differences:\n")
print(dmean)
stopifnot(all(dmean > 0, na.rm=TRUE))