# R course on copy numbers using ASCAT and biomaRt

#######################################################################
##### Exercise 1: Paired parent-specific copy number segmentation #####
#######################################################################

#Navigate to the Affymetrix directory
#Start an R session


library("aroma.affymetrix");
verbose <- Arguments$getVerbose(-10, timestamp=TRUE);
 
#The data library holds a subset of the public dataset GSE12702, available at GEO
dataSet <- "GSE12702";
chipType <- "Mapping250K_Nsp";
 
# Load all samples 
csRall <- AffymetrixCelSet$byName(dataSet, chipType=chipType);
 
# The data includes two tumor-normal pairs 
# Extract tumor-normal pair of interest (pick ONE of these two lines)
pair <- c(T="GSM318734", N="GSM318735");
pair <- c(T="GSM318736", N="GSM318737");
# T is the tumor, N is the matched normal

#Now we extract the pair of interest from our cel set
csR <- extract(csRall, indexOf(csRall, pair));
print(csR);


#Allele-specific copy-number estimates
#Here we will use an allele-specific version of CRMAv2 to estimate total CNs (TCNs) and allele B fractions (BAFs) from the two CEL files.
#AS-CRMAv2 will take care of signal normalization etc:

res <- doASCRMAv2(csR, verbose=verbose);

#This will take a couple of minutes per array.
#Results are saved to disk, so if you need to work with this data in the future, Aroma will just load the normalized data.


#Extracting PSCN estimates for the tumor-normal pair
#Next we need to extract TCNs and BAFs for the tumor and the matched normal:

# Extract (total,beta) estimates for the tumor-normal pair
data <- extractPSCNArray(res$total);
dimnames(data)[[3]] <- names(pair);
str(data);

#Actually, at this stage the TCNs have not been standardized toward a reference.  
#Here we will calculate TCNs as C = 2*T/N where T and N are the total "CN" signals for the tumor and the matched normal, respectively.

# Total CNs for the tumor relative to the matched normal
CT <- 2 * (data[,"total","T"] / data[,"total","N"]);
 
# Allele B fractions for the tumor
betaT <- data[,"fracB","T"];
 
# Allele B fractions for the normal
betaN <- data[,"fracB","N"];
#What is remaining is to get the genomic locations for these data points:

# Get (chromosome, position) annotation data
ugp <- getAromaUgpFile(res$total);
chromosome <- ugp[,1,drop=TRUE];
x <- ugp[,2,drop=TRUE];


#We now have all the PSCN data we need:

# Setup data structure for Paired PSCBS
df <- data.frame(chromosome=chromosome, x=x, CT=CT, betaT=betaT, betaN=betaN);

#Parent-specific copy-number segmentation
#We next use Paired PSCBS to segment the above PSCN estimates:

library("PSCBS");
 
fit <- segmentByPairedPSCBS(df, verbose=verbose);
#Note that this may take several minutes (per tumor-normal pair).

#To access the table of identified sements, use the getSegments() method, which returns a data.frame (so it can easily be written to file):

segs <- getSegments(fit);
print(segs);
#For more details on Paired PSCBS options, see help("segmentByPairedPSCBS", package="PSCBS").

#We can plot the TCN, the decrease-of-heterozygosity (DH), and the minor-major CN estimates as follows:
pairName <- paste(pair, collapse="vs");
chrTag <- sprintf("Chr%s", seqToHumanReadable(getChromosomes(fit)));
 
#Using toPNG(), images files are by default saved in figures/.	
toPNG(pairName, tags=c(chrTag, "PairedPSCBS"), width=840, aspectRatio=0.6, {
  plotTracks(fit);
});

#We can further call regions of LOH and by bootstrapping
fit.LOH <- bootstrapTCNandDHByRegion(fit, verbose=verbose)
deltaAB <- estimateDeltaAB(fit.LOH, flavor="qq(DH)", verbose=verbose)
fit.LOH <- callAB(fit.LOH, delta=deltaAB, verbose=verbose)
deltaLOH <- estimateDeltaLOH(fit.LOH, flavor="minC1|nonAB", verbose=verbose) 
fit.LOH <- callLOH(fit.LOH, delta=deltaLOH, verbose=verbose)

toPNG(pairName, tags=c(chrTag, "PairedPSCBS.LOH"), width=840, aspectRatio=0.6, {
  plotTracks(fit.LOH);
});


######################################################################
###################### Exercise 2: ASCAT analysis ####################
######################################################################
#Navigate to the ASCAT directory
#Start an R session

source("ascat.R")

#Load the data
ascat.bc <- ascat.loadData(Tumor_LogR_file='Tumor_LogR.txt', Tumor_BAF_file='Tumor_BAF.txt', Germline_LogR_file='Germline_LogR.txt', Germline_BAF_file='Germline_BAF.txt')

#Plot the raw data
ascat.plotRawData(ascat.bc)

#Segment the data
ascat.bc <- ascat.aspcf(ascat.bc)

#Plot the segmented data
ascat.plotSegmentedData(ascat.bc)

#Run ASCAT to fit every tumor to a model, inferring ploidy, normal cell contamination, and discrete copy numbers
ascat.output <- ascat.runAscat(ascat.bc)
str(ascat.output)
plot(sort(ascat.output$aberrantcellfraction))
plot(density(ascat.output$ploidy))

# The ASCAT output is a bit cumbersome to work with, I always turn it into a matrix with 7 columns, with chromosome, start, stop and allelic copy number info:
source('organize.ascat.segments.r')
ascat.segments <- organize.ascat.segments(ascat.output, ascat.bc$SNPpos)
head(ascat.segments)


###### If you do not have paired normal samples ######
# ASCAT also allows you to analyze data where you do not have paired normals. 
# This often comes in handy.

ascat.bc <- ascat.loadData(Tumor_LogR_file='Tumor_LogR.txt', Tumor_BAF_file='Tumor_BAF.txt')
source("predictGG.R")
platform = "Affy250k_nsp"
ascat.gg = ascat.predictGermlineGenotypes(ascat.bc, platform)
ascat.bc = ascat.aspcf(ascat.bc,ascat.gg=ascat.gg)
ascat.plotSegmentedData(ascat.bc)
ascat.output = ascat.runAscat(ascat.bc)


############################
######## Exercise 3 ########
############################
#Calculate frequency of copy number gain and loss

#First, add the allele specific copy numbers to get total copy number (tcn)
ascat.segments.tcn <- cbind(ascat.segments[,1:5], ascat.segments[,6]+ascat.segments[,7])
colnames(ascat.segments.tcn)[6] <- "tcn"

#ASCAT fits its output to a model with a given ploidy, so tcn must be corrected for ploidy to determine gain and loss
#Extract ploidy calls
ploidy <- ascat.output$ploidy
names(ploidy) <- colnames(ascat.output$nA)

#Correct TCN for ploidy
ascat.segments.tcn[,6] <- ascat.segments.tcn[,6] - ploidy[match(ascat.segments.tcn[,1], names(ploidy))]
head(ascat.segments.tcn)

#Load necessary functions 
source('min.cons.freq.r')

#Now turn the matrix of segmented copy number values into a minimum consistency matrix
min.cons.segs <- min.cons.fn(ascat.segments.tcn)
head(min.cons.segs)

#Now plot frequency gained or lost across the cohort
png('frequencyPlot.png')
chr.freq.plot(min.cons.segs, main="My results", ylab="Frequency gained or lost", xlab="Chromosome (MB)", loss=-1, gain=1) 
dev.off()