#Input: (1) A binary vector of length m (binary.vec), (2) a natural number k (start.site). #Output: a binary vector of length m (output.vec) that has only one contiguous string of 1's, namely the one that #appeared in binary.vec at start.site. #Description: For convenience we write k instead of start.site. The kth entry of output.vec is #defined to be 1. Starting at j = k + 1 and continuing for all j > k, the jth entry of output.vec #is 1 if and only if (1) the jth entry of binary.vec is 1, and (2) for all l such that k <= l <= j, #the lth entry of binary.vec is 1. The case j < k is handled similarly. #Requires these functions: none. #Is called by these functions: peeling. zero.make = function(binary.vec, start.site) { #Constants and vectors used in the function m = length(binary.vec) output.vec = rep(0, m) #Define the entries of output.vec #Move to the right of start.site temp.spot = start.site temp.value = binary.vec[temp.spot] while ((temp.value > 0) && (temp.spot < m)) { output.vec[temp.spot] = 1 temp.spot = temp.spot + 1 temp.value = binary.vec[temp.spot] } if ((temp.spot == m) && (temp.value > 0)) { output.vec[temp.spot] = 1 } #Move to the left of start.site temp.spot = start.site temp.value = binary.vec[temp.spot] while ((temp.value > 0) && (temp.spot > 1)) { output.vec[temp.spot] = 1 temp.spot = temp.spot - 1 temp.value = binary.vec[temp.spot] } if ((temp.spot == 1) && (temp.value > 0)) { output.vec[temp.spot] = 1 } return(output.vec) } #Input: (1) An m x r data frame of marker data (marker.data), and (2) a four-column cytoband annotation #file (annot.file). #Output: A vector of length m that contains the cytoband location of each marker (output.vec). #Description: This function finds the cytoband location of each marker. #Requires these functions: none. #Is called by these functions: peeling. make.cytoband = function(marker.data, annot.file) { #Constants and vectors used in the function n = dim(marker.data)[1] output.vec = rep(-23, n) #Find the cytoband for each marker based on its genomic position for (i in (1:n)) { temp = as.numeric(marker.data[i, 1:2]) chrom = temp[1] position = temp[2] vec1 = as.numeric(as.numeric(annot.file[,1]) == chrom) vec2 = as.numeric(as.numeric(annot.file[,2]) <= position) vec3 = as.numeric(as.numeric(annot.file[,3]) > position) annot.row = which((vec1 + vec2 + vec3) == 3) output.vec[i] = annot.file[annot.row, 4] } #Return the output return(output.vec) } #Input: (1) an n x m data matrix (x), (2) a natural number (num.perms) #Output: A num.perms x 2 matrix (output) #Description: This function uses num.perms iterations of the cyclic shift procedure to find an empirical null #distribution for T(X) = the maximum and minimum column sum of X. The null distribution of the minimum column #sum appears in column 1 of output, whereas the null distribution for the maximum column sum appears in column #2 of output. #Requires these functions: none. #Is called by these functions: detailed.look, quick.look. find.null = function(x, num.perms, random.seed) { #Set random seed, if apppropriate if (length(random.seed) > 0) { set.seed(random.seed) } #Constants and vectors used in the function n = dim(x)[1] m = dim(x)[2] shift.vec = rep(23, num.perms) #Perform the cyclic shift procedure, and record the minimum and maximum column sum after each #cyclic shift. for (j in (1:num.perms)) { cutpoints = sample(c(1:m), size = n, replace = TRUE) perm.x = matrix(0, n, m) for (i in (1:n)) { index = cutpoints[i] if (index == 1) { second = c() } if (index > 1) { second = x[i, 1:(index - 1)] } first = x[i, index:m] perm.x[i,] = c(first, second) } perm.col.sums = colSums(perm.x) shift.vec[j] = max(colSums(perm.x)) } #Define and return the output return(shift.vec) } #Input: (1) An n x m data matrix (x), (2) an m x r data frame of marker data (marker.data), #(3) a vector of length m (cytoband) that contains the cytoband for each marker, and #(4) a natural number (k). #Output: A list containing the following items: (1) An n x m data matrix, (2) the peak interval. #Description: This function applies the peeling procedure to the input data matrix x starting at #the most aberrant marker k. #Requires these functions: zero.make. #Is called by these functions: detailed.look, quick.look. peeling = function(x, marker.data, cytoband, k) { n = dim(x)[1] m = dim(x)[2] rowmeans = rowMeans(x) grandmean = mean(rowmeans) #This section of code creates exceed, a binary n x chr.m matrix, where chr.m is the number of markers #in the chromosome containing k, the marker where the peeling procedure begins. An entry of exceed is 1 #if the corresponding entry of x belongs to the peak interval chr = marker.data[k, 1] chr.start = min(which(marker.data[,1] == chr)) chr.end = max(which(marker.data[,1] == chr)) chr.m = chr.end - chr.start + 1 chr.k = k - chr.start + 1 chr.data = x[,chr.start:chr.end] col.means = colMeans(chr.data) col.indicator = as.numeric(col.means > grandmean) col.indicator = zero.make(col.indicator, chr.k) exceed = rep(1, n) %o% col.indicator which.band = substr(cytoband[k], 1, 1) band.vec = substr(cytoband[chr.start:chr.end], 1, 1) band.ind.vec = as.numeric(band.vec == which.band) band.matrix = rep(1, n) %o% band.ind.vec exceed = exceed * band.matrix #This section of code find the markers that make up the peak interval peak.interval = as.numeric(colSums(exceed) > 0) chr.left.side = min(which(peak.interval == 1)) chr.right.side = max(which(peak.interval == 1)) left.side = chr.left.side + chr.start - 1 right.side = chr.right.side + chr.start - 1 #These will be used below columnmean = mean(chr.data[,chr.k]) #This section of code creates exceed.running, a binary n x chr.m matrix. An entry of exceed.running #is 1 if the corresponding entry of x will be peeled, otherwise it is zero. exceed.running = matrix(0, n, chr.m) exceed.running[,chr.k] = as.numeric(chr.data[,chr.k] > rowmeans) if (chr.k > 1) { for (j in (chr.k - 1):1) { exceed.running[,j] = exceed.running[,(j + 1)] * exceed[,j] * as.numeric(chr.data[,j] > rowmeans) } } if (chr.k < chr.m) { for (j in (chr.k + 1):chr.m) { exceed.running[,j] = exceed.running[,(j - 1)] * exceed[,j] * as.numeric(chr.data[,j] > rowmeans) } } #This section of code implements a multiplicative-based shrinkage approach at column not.imputed = chr.data * (1 - exceed.running) to.be.imputed = chr.data * exceed.running num.term1 = n * grandmean num.term2 = sum(not.imputed[,chr.k]) num.term3 = sum(exceed.running[,chr.k] * rowmeans) den.term = sum(to.be.imputed[,chr.k]) - num.term3 tau = (num.term1 - num.term2 - num.term3)/den.term imputed = tau * (to.be.imputed - (matrix(rowmeans, n, chr.m) * exceed.running)) + (matrix(rowmeans, n, chr.m) * exceed.running) final.matrix = imputed + not.imputed #Create the output x[,chr.start:chr.end] = final.matrix output.list = list(x, c(left.side, right.side)) #Return the output return(output.list) } #Input: (1) An n x m data matrix (x), (2) a m x r data frame of marker data (marker.data) as defined in #cytoband.function, (3) a cytoband annotation file (annot.file), (4) a natural number (num.perms), (5) a #natural number (num.iters), (6) a character string - either "gain" or "loss" - used to indicate whether #gains or losses, respectively, are analyzed, and (7) an optional random seed. #Output: A list containing two elements: (1) a num.iters x (r + 4) data frame, and (2) an two-dimensional #array of lists. See DiNAMIC_README.pdf for more information about the output. #Description: This function applies the Detailed Look procedure to the input data matrix x. Empirical #null distributions of T(X) = maximum or minimum column sum are created based on num.perms cyclic shifts. #The function finds the num.iters most aberrant markers, assesses their statistical significance, peels #them, and finds the various peak intervals. By default the function considers both the minimum and #maximum column sum (high.only = FALSE), but it can also assess the significance of only the maximum #column sum (high.only = TRUE). #Requires these functions: find.null, marker.finder, peeling, zero.make, make.cytoband. #Is called by these functions: none. detailed.look = function(x, marker.data, annot.file, num.perms, num.iters, gain.loss = "gain", random.seed = NULL) { #Constants and vectors used in the function n = dim(x)[1] m = dim(x)[2] r = dim(marker.data)[2] small.marker.data = as.matrix(marker.data[,1:2]) chrom.vec = small.marker.data[,1] cytoband = make.cytoband(small.marker.data, annot.file) marker.matrix = as.matrix(marker.data) gain.loss.ind = as.numeric(gain.loss == "gain") - as.numeric(gain.loss == "loss") #Perform the Detailed Look procedure data.matrix = x * gain.loss.ind output.matrix = matrix(23, num.iters, r + 1) colnames(output.matrix) = c(colnames(marker.matrix), "p-Value") output.list = array(list(), c(num.iters, r)) colnames(output.list) = colnames(marker.matrix) for (i in (1:num.iters)) { null.dist = find.null(data.matrix, num.perms, random.seed) col.sums = colSums(data.matrix) obs.max = max(col.sums) k = which.max(col.sums) p.val = min(mean(obs.max 2) { for (j in (3:r)) { output.list[i, j] = list(marker.matrix[interval[1]:interval[2], j]) } } } output.matrix = as.data.frame(output.matrix) output = list(output.matrix, output.list) #Return output return(output) } #Input: (1) An n x m data matrix (x), (2) a m x r data frame of marker data (marker.data) as defined in #cytoband.function, (3) a cytoband annotation file (annot.file), (4) a natural number (num.perms), (5) a #natural number (num.iters), (6) a character string - either "gain" or "loss" - used to indicate whether #gains or losses, respectively, are analyzed, and (7) an optional random seed. #Output: A list containing two elements: (1) a num.iters x (r + 4) data frame, and (2) an two-dimensional #array of lists. See DiNAMIC_README.pdf for more information about the output. #Description: This function applies the Detailed Look procedure to the input data matrix x. Empirical #null distributions of T(X) = maximum or minimum column sum are created based on num.perms cyclic shifts. #The function finds the num.iters most aberrant markers, assesses their statistical significance, peels #them, and finds the various peak intervals. By default the function considers both the minimum and #maximum column sum (high.only = FALSE), but it can also assess the significance of only the maximum #column sum (high.only = TRUE). #Requires these functions: find.null, marker.finder, peeling, zero.make, make.cytoband. #Is called by these functions: none. quick.look = function(x, marker.data, annot.file, num.perms, num.iters, gain.loss = "gain", random.seed = NULL) { #Constants and vectors used in the function n = dim(x)[1] m = dim(x)[2] r = dim(marker.data)[2] small.marker.data = as.matrix(marker.data[,1:2]) chrom.vec = small.marker.data[,1] cytoband = make.cytoband(small.marker.data, annot.file) marker.matrix = as.matrix(marker.data) gain.loss.ind = as.numeric(gain.loss == "gain") - as.numeric(gain.loss == "loss") #Perform the Detailed Look procedure data.matrix = x * gain.loss.ind output.matrix = matrix(23, num.iters, r + 1) colnames(output.matrix) = c(colnames(marker.matrix), "p-Value") output.list = array(list(), c(num.iters, r)) colnames(output.list) = colnames(marker.matrix) null.dist = find.null(data.matrix, num.perms, random.seed) for (i in (1:num.iters)) { col.sums = colSums(data.matrix) obs.max = max(col.sums) k = which.max(col.sums) p.val = min(mean(obs.max 2) { for (j in (3:r)) { output.list[i, j] = list(marker.matrix[interval[1]:interval[2], j]) } } } output.matrix = as.data.frame(output.matrix) output = list(output.matrix, output.list) #Return output return(output) } #Input: (1) A six-column matrix (seg.matrix) produced by the segmentation algorithm DNAcopy, (2) an m x r #data frame of marker data (marker.matrix). #Output: An n x m matrix. #Description: This function converts the six-column output of DNAcopy to an n x m matrix that can be #analyzed with the cyclic shift procedure. Here n represents the number of subjects, and m is the number #of markers. #Requires these functions: none. #Is called by these functions: bias.correction. make.shift.array = function(seg.matrix, marker.data) { #Remove the first column of seg.matrix seg.matrix = as.matrix(seg.matrix[,2:6]) #Find the marker where each subject's segmentation begins marker.matrix = as.matrix(marker.data[,1:2]) marker.vec = marker.matrix[,2] marker.vec = cumsum(as.numeric(marker.vec)) first.marker = marker.vec[1] #Find the number of subjects n = sum(as.numeric(as.numeric(seg.matrix[,2]) == first.marker)) #Find the number of markers m = length(marker.vec) #Find the rows of seg.matrix for each subject start.rows = which(as.numeric(seg.matrix[,2]) == first.marker) end.rows = c((start.rows[2:n] - 1), dim(seg.matrix)[1]) location.matrix = cbind(start.rows, end.rows) #Create a blank n x m matrix data.matrix = matrix(23, n, m) #Fill data.matrix one row at a time subset = seg.matrix[,4:5] for (i in (1:n)) { start = location.matrix[i, 1] stop = location.matrix[i, 2] temp.matrix = matrix(as.numeric(subset[start:stop,]), stop - start + 1, 2) temp.row = c() for (j in (1:dim(temp.matrix)[1])) { temp.row = c(temp.row, rep(temp.matrix[j, 2], temp.matrix[j, 1])) } data.matrix[i,] = temp.row } #Return the output return(data.matrix) } #Input: (1) An n x m data matrix (x), (2) an m x r data frame of marker data (marker.data). #Output: An n x m matrix. #Description: This function applies the bias-correction procedure to the input matrix x. #Requires these functions: make.shift.array. Also, the package DNAcopy is used. #Is called by these functions: none. bias.correction = function(x, marker.data) { #Load DNAcopy library(DNAcopy) #Constants and vectors used in the function marker.matrix = as.matrix(marker.data[,1:2]) chr.vec = marker.matrix[,1] start.sites = marker.matrix[,2] n = dim(x)[1] m = dim(x)[2] #Use DNAcopy to create a segmented version of x cna.first = CNA(t(x), chr.vec, start.sites, data.type = "logratio") seg.first = segment(cna.first) output.first = seg.first$output num.cols = dim(output.first)[2] seg.values.first = as.numeric(output.first[,num.cols]) output.first[,num.cols] = seg.values.first segmented.first = make.shift.array(output.first, marker.data) #Estimate and remove the bias resid = x - segmented.first bias.est = colMeans(resid) bias.matrix = matrix(bias.est, n, m, byrow = TRUE) adjusted.matrix = x - bias.matrix #Create the bias-corrected version of x cna.second = CNA(t(adjusted.matrix), chr.vec, start.sites, data.type = "logratio") seg.second = segment(cna.second) output.second = seg.second$output seg.values.second = as.numeric(output.second[,num.cols]) output.second[,num.cols] = seg.values.second segmented.second = make.shift.array(output.second, marker.data) #Return the output return(segmented.second) }