# R course on copy numbers using ASCAT and biomaRt ####################################################################### ##### Exercise 1: Paired parent-specific copy number segmentation ##### ####################################################################### #Navigate to the Affymetrix directory #Start an R session library("aroma.affymetrix"); verbose <- Arguments$getVerbose(-10, timestamp=TRUE); #The data library holds a subset of the public dataset GSE12702, available at GEO dataSet <- "GSE12702"; chipType <- "Mapping250K_Nsp"; # Load all samples csRall <- AffymetrixCelSet$byName(dataSet, chipType=chipType); # The data includes two tumor-normal pairs # Extract tumor-normal pair of interest (pick ONE of these two lines) pair <- c(T="GSM318734", N="GSM318735"); pair <- c(T="GSM318736", N="GSM318737"); # T is the tumor, N is the matched normal #Now we extract the pair of interest from our cel set csR <- extract(csRall, indexOf(csRall, pair)); print(csR); #Allele-specific copy-number estimates #Here we will use an allele-specific version of CRMAv2 to estimate total CNs (TCNs) and allele B fractions (BAFs) from the two CEL files. #AS-CRMAv2 will take care of signal normalization etc: res <- doASCRMAv2(csR, verbose=verbose); #This will take a couple of minutes per array. #Results are saved to disk, so if you need to work with this data in the future, Aroma will just load the normalized data. #Extracting PSCN estimates for the tumor-normal pair #Next we need to extract TCNs and BAFs for the tumor and the matched normal: # Extract (total,beta) estimates for the tumor-normal pair data <- extractPSCNArray(res$total); dimnames(data)[[3]] <- names(pair); str(data); #Actually, at this stage the TCNs have not been standardized toward a reference. #Here we will calculate TCNs as C = 2*T/N where T and N are the total "CN" signals for the tumor and the matched normal, respectively. # Total CNs for the tumor relative to the matched normal CT <- 2 * (data[,"total","T"] / data[,"total","N"]); # Allele B fractions for the tumor betaT <- data[,"fracB","T"]; # Allele B fractions for the normal betaN <- data[,"fracB","N"]; #What is remaining is to get the genomic locations for these data points: # Get (chromosome, position) annotation data ugp <- getAromaUgpFile(res$total); chromosome <- ugp[,1,drop=TRUE]; x <- ugp[,2,drop=TRUE]; #We now have all the PSCN data we need: # Setup data structure for Paired PSCBS df <- data.frame(chromosome=chromosome, x=x, CT=CT, betaT=betaT, betaN=betaN); #Parent-specific copy-number segmentation #We next use Paired PSCBS to segment the above PSCN estimates: library("PSCBS"); fit <- segmentByPairedPSCBS(df, verbose=verbose); #Note that this may take several minutes (per tumor-normal pair). #To access the table of identified sements, use the getSegments() method, which returns a data.frame (so it can easily be written to file): segs <- getSegments(fit); print(segs); #For more details on Paired PSCBS options, see help("segmentByPairedPSCBS", package="PSCBS"). #We can plot the TCN, the decrease-of-heterozygosity (DH), and the minor-major CN estimates as follows: pairName <- paste(pair, collapse="vs"); chrTag <- sprintf("Chr%s", seqToHumanReadable(getChromosomes(fit))); #Using toPNG(), images files are by default saved in figures/. toPNG(pairName, tags=c(chrTag, "PairedPSCBS"), width=840, aspectRatio=0.6, { plotTracks(fit); }); #We can further call regions of LOH and by bootstrapping fit.LOH <- bootstrapTCNandDHByRegion(fit, verbose=verbose) deltaAB <- estimateDeltaAB(fit.LOH, flavor="qq(DH)", verbose=verbose) fit.LOH <- callAB(fit.LOH, delta=deltaAB, verbose=verbose) deltaLOH <- estimateDeltaLOH(fit.LOH, flavor="minC1|nonAB", verbose=verbose) fit.LOH <- callLOH(fit.LOH, delta=deltaLOH, verbose=verbose) toPNG(pairName, tags=c(chrTag, "PairedPSCBS.LOH"), width=840, aspectRatio=0.6, { plotTracks(fit.LOH); }); ###################################################################### ###################### Exercise 2: ASCAT analysis #################### ###################################################################### #Navigate to the ASCAT directory #Start an R session source("ascat.R") #Load the data ascat.bc <- ascat.loadData(Tumor_LogR_file='Tumor_LogR.txt', Tumor_BAF_file='Tumor_BAF.txt', Germline_LogR_file='Germline_LogR.txt', Germline_BAF_file='Germline_BAF.txt') #Plot the raw data ascat.plotRawData(ascat.bc) #Segment the data ascat.bc <- ascat.aspcf(ascat.bc) #Plot the segmented data ascat.plotSegmentedData(ascat.bc) #Run ASCAT to fit every tumor to a model, inferring ploidy, normal cell contamination, and discrete copy numbers ascat.output <- ascat.runAscat(ascat.bc) str(ascat.output) plot(sort(ascat.output$aberrantcellfraction)) plot(density(ascat.output$ploidy)) # The ASCAT output is a bit cumbersome to work with, I always turn it into a matrix with 7 columns, with chromosome, start, stop and allelic copy number info: source('organize.ascat.segments.r') ascat.segments <- organize.ascat.segments(ascat.output, ascat.bc$SNPpos) head(ascat.segments) ###### If you do not have paired normal samples ###### # ASCAT also allows you to analyze data where you do not have paired normals. # This often comes in handy. ascat.bc <- ascat.loadData(Tumor_LogR_file='Tumor_LogR.txt', Tumor_BAF_file='Tumor_BAF.txt') source("predictGG.R") platform = "Affy250k_nsp" ascat.gg = ascat.predictGermlineGenotypes(ascat.bc, platform) ascat.bc = ascat.aspcf(ascat.bc,ascat.gg=ascat.gg) ascat.plotSegmentedData(ascat.bc) ascat.output = ascat.runAscat(ascat.bc) ############################ ######## Exercise 3 ######## ############################ #Calculate frequency of copy number gain and loss #First, add the allele specific copy numbers to get total copy number (tcn) ascat.segments.tcn <- cbind(ascat.segments[,1:5], ascat.segments[,6]+ascat.segments[,7]) colnames(ascat.segments.tcn)[6] <- "tcn" #ASCAT fits its output to a model with a given ploidy, so tcn must be corrected for ploidy to determine gain and loss #Extract ploidy calls ploidy <- ascat.output$ploidy names(ploidy) <- colnames(ascat.output$nA) #Correct TCN for ploidy ascat.segments.tcn[,6] <- ascat.segments.tcn[,6] - ploidy[match(ascat.segments.tcn[,1], names(ploidy))] head(ascat.segments.tcn) #Load necessary functions source('min.cons.freq.r') #Now turn the matrix of segmented copy number values into a minimum consistency matrix min.cons.segs <- min.cons.fn(ascat.segments.tcn) head(min.cons.segs) #Now plot frequency gained or lost across the cohort png('frequencyPlot.png') chr.freq.plot(min.cons.segs, main="My results", ylab="Frequency gained or lost", xlab="Chromosome (MB)", loss=-1, gain=1) dev.off()