# Long Lab SNP Genotype Calling Function # # EMAIL: sjm@uci.edu # # DATE: 10 July 2004 # # NEED: R (www.R-project.org), including ellipse and splancs # packages (cran.r-project.org) ######################## # README ######################## # # Basic Description: # # 1. Takes in an input file holding intensity data from each allele of # a number of SNPs, plots the intensity of allele-a against allele-b, # and prompts the user to define the 3 genotype clusters (a/a, a/b, # and b/b) # # 2. A bivariate normal likelihood function then assigns individuals to # genotype classes on the basis of these user-defined clusters # # 3. Outputs a plot, coloured by the assigned genotype calls, a file # holding the assigned genotype calls, and a file of the confidence # in this call # # Input File Requirements: # # 1. Must be a tab-delimited *.txt file # # 2. Need an indicator column called "plate", holding a 1:N vector # indicating which PCR plate each individual is from # # 3. Need an indicator column called "blanks", holding a 2 for blanks, # and a 1 otherwise (on cluster-defining plot, blanks = red, and # non-blanks = black) # # 4. Input files should not hold too many SNPs. Software can cope with any # number, but currently must move from first to last SNP in file: it is # not possible to start calling from an arbitrary SNP # # Calling Genotypes: # # 1. Clusters must be defined by user in the order specified by the program # # 2. Avoid pressing keys other than '0', '1', '3', and '99' during calling, # as this can have unpredictable results # # 3. Software hard-wired to expect 3 clusters. If there are only two (due to # absent rare homozygote class), MUST draw a polygon on the device # WITHOUT encompassing points, and this empty polygon MUST be drawn in # the sequence expected by the software # # Important Requirements # # 1. functions from the 'ellipse' and 'splancs' packages are required, # and are not included in the standard R distribution, so must be # installed by the user # # 2. 'cluster.genotype' function needs to open a graphics device on the # screen, which is platform dependent, so the user should comment in # the appropriate line of code (at top of 'cluster.genotype' function) # # Output File Details: # # 1. GENOTYPE DATA FILE (tab-delimited *.txt) # -> holds same indicator columns as in input file # -> holds a single column for each SNP, named as the first allele of # of the SNP from the input file # -> meaning of genotype calls is as follows: NA = point not plotted # (likely intensity data from one allele very low), 6 = SNP could not # be called, 5 = point was not given a genotype call, 1 = a/b # heterozygote, 2 = b/b homozygote, 3 = a/a homozygote # # 2. CONFIDENCE DATA FILE (tab-delimited *.txt) # -> holds same indicator columns as in input file # -> holds a single column for each SNP, named as the first allele of # of the SNP from the input file # -> value is the confidence in the assigned genotype call # # 3. PLOT FILE (*.pdf) # -> shows coloured intensity plot for each SNP, with points colour- # coded by assigned genotype # -> colours are as follows: b/b homozygote - genotype 2 = RED, # a/b heterozygote - genotype 1 = BLACK, a/a homozygote - genotype 3 = # GREEN, uncalled - genotype 5 = CYAN ######################## # After pasting in functions, and changing R default directory to # that holding input file, simply copy following command into window: genotype.snps() ################################################################################# ################################################################################# ### ### ### ### ### PASTE FUNCTIONS BELOW INTO R WINDOW ### ### ### ### ### ################################################################################# ################################################################################# ######################## # FUNCTION genotype.snps ######################## # DESCRIPTION Calls functions allowing genotypes to be called # -> enter filename and number of first column holding SNP data # -> pass data through 'log.snp.data function # -> pass logged data through 'cluster.genotype' function ######################## genotype.snps <- function() { # input filename and the number of the first column holding # SNP intensity data print("ENTER name of datafile, excluding extension: ",quote=F) input.file <- scan(what="character",nmax=1) print("ENTER number of first data column: ",quote=F) first.data.col <- as.numeric(scan(what="numeric",nmax=1)) # read in datafile # call 'log.snp.data' function to natural log data, and save interim file intensity.dat <- read.delim(paste(input.file, "txt", sep = "."), header = TRUE) last.col.predict <- first.data.col - 1 num.snp <- (ncol(intensity.dat) - last.col.predict)/2 intensity.dat <- log.snp.data(input.file, intensity.dat, last.col.predict) write.table(intensity.dat, file = paste(input.file, "log", "txt", sep = "."), col.names = TRUE, row.names = FALSE, quote = FALSE, sep = " ") # prepare to send data through 'cluster.genotype' function num.plate <- 1 plate.indic <- rep(1,nrow(intensity.dat)) plot.out <- paste(input.file, "plot", "pdf", sep = ".") genotype.out <- paste(input.file, "genotype", "txt", sep = ".") confidence.out <- paste(input.file, "confidence", "txt", sep = ".") cluster.genotype(intensity.dat, num.snp, last.col.predict, plate.indic, num.plate, plot.out, genotype.out, confidence.out) } ######################## # FUNCTION log.snp.data ######################## # DESCRIPTION takes natural log of data in 'intensity.dat' # -> prior to log replaces values equal to/below 1 with NA ######################## log.snp.data <- function(input.file, intensity.dat, last.col.predict) { tmp.matrix <- as.matrix(intensity.dat[,(last.col.predict+1):ncol(intensity.dat)]) tmp.matrix[tmp.matrix <= 1] <- NA tmp.log.matrix <- log(tmp.matrix) intensity.dat <- cbind(intensity.dat[,1:last.col.predict], as.data.frame(tmp.log.matrix)) } ######################## # FUNCTION cluster.genotype ######################## # DESCRIPTION calls genotype of each individual for each SNP # -> plots a- vs. b-allele for each SNP, and asks user to # define the 3 clusters (a/b-HET, b/b-HOMO, a/a-HOMO) # -> bivariate normal pdf applied giving likelihood of a point # being in a given cluster # -> data exported to 3 output files: a genotype table, a # table of confidence values, and a colour-coded genotype plot ######################## cluster.genotype <- function(intensity.dat, num.snp, last.col.predict, plate.indic, num.plate, plot.out, genotype.out, confidence.out) { library(ellipse) library(splancs) graphics.off() pdf(file = plot.out, paper = "letter") # For Windoze versions change comment in: # windows(width=8, height=8) # For Mac OS X comment in: quartz(width=8, height=8) # For UNIX X11 comment in: # x11(width=8, height=8) genotype.dat <- intensity.dat[plate.indic == num.plate, 1:last.col.predict] confidence.dat <- intensity.dat[plate.indic == num.plate, 1:last.col.predict] snp.vector <- 1:num.snp #OPEN# Loop through SNPs for(j in snp.vector) { snpj.name <- colnames(intensity.dat)[last.col.predict+(2*j)-1] snpj.dat <- as.points(intensity.dat[,last.col.predict+(2*j)-1], intensity.dat[,last.col.predict+(2*j)]) min.plot.axis <- 0 good.data <- 3 num.clust <- 3 #OPEN# Draw a- vs. b-allele plot and zoom while(good.data == 3) { # ARGUMENTS TO 'plot' # 'probs' argument set to 0 via 'min.plot.axis' above, so initial # value for X(Y)-axis is the lowest value of X(Y); if user hits # hits 3 - zoom - 'min.plot.axis' increments by 0.02, and plot # loses lowest 2% of points # max value of X(Y)-axis is set 5% higher than the highest value of X(Y) # known blanks in plot are red [2], while rest of points are black [1], # which requires an appropriately coded "blanks" column in file snpj.plot <- cbind(snpj.dat, intensity.dat["blanks"]) plot(snpj.plot[,1], snpj.plot[,2], pch = 20, col = snpj.plot$blanks, xlab = "probe a", ylab = "probe b", xlim = c(quantile(snpj.dat[,1], probs = min.plot.axis, na.rm = TRUE), (max(snpj.dat[,1], na.rm = TRUE)*1.05)), ylim = c(quantile(snpj.dat[,2], probs = min.plot.axis, na.rm=TRUE), (max(snpj.dat[,2], na.rm = TRUE)*1.05)), main = snpj.name) print("Do you wish to call SNP? (Yes = 1, No = 0, Zoom = 3, Exit = 99)", quote = FALSE) good.data <- scan(nmax = 1) min.plot.axis <- min.plot.axis + 0.02 } #CLOSE# Plot/zoom loop #OPEN# Once user has determined SNP is good if(good.data == 1) { # following list files allow each cluster to use a similarly named # data object, indexed by the cluster (k) clust.dat <- list() clust.mean <- list() clust.covar <- list() ci.ellipse <- list() BVN <- list() #OPEN# Loop through clusters for(k in 1:num.clust) { if(k == 1) { print("Draw polygons about the 3 clusters:", quote = FALSE) print("---> click points with mouse", quote = FALSE) print("---> use 'escape' to finish on quartz device", quote = FALSE) print("---> use 'return' to finish on X11/Windoze device", quote = FALSE) print("---> polygon closes automatically", quote = FALSE) print("", quote = FALSE) print("Draw polygon about a/b heterozygote cluster.", quote = FALSE) } if(k == 2) { print("", quote = FALSE) print("Draw polygon about b/b homozygote cluster.", quote = FALSE) } if(k == 3) { print("", quote = FALSE) print("Draw polygon about a/a homozygote cluster.", quote = FALSE) } # allows user to draw polygon on plot clust.polygon <- getpoly(quiet = TRUE) # (1) # move rows of 'snpj.dat' for which neither value is 'NA' to # 'snpj.dat.trunc' # (2) # test whether points from 'snpj.dat.trunc' are within the user # defined polygon ('inout' function) # -> 'within.polygon' = vector of TRUE (within polygon)/FALSE # (outside polygon) # -> 'bound' argument determines fate of points falling exactly # on polygon boundary: NULL = arbitrary assigment, TRUE = # assigned within, FALSE = assigned outside # (3) # 'within.clust' = vector of FALSE, with length 'nrow(snpj.dat)' # (4) # assess 'snpj.dat' for those rows where neither value is 'NA', and # when satisfied, replace FALSE in 'within.clust' with statement # from 'within.polygon' # -> provided 'within.clust' and 'within.polygon' retain values # in the same order, referencing 'snpj.dat' keeps the vectors # consistent despite length differences due to 'NA' removal # -> note that no distinction is made between points falling # outside the polygon (FALSE) and those which are not plotted # as one/both values is 'NA' (FALSE) # (5) # for points within the drawn polygon data is read into 'clust.dat' # from 'snpj.dat' snpj.dat.trunc <- snpj.dat[(!is.na(snpj.dat[,1])) &(!is.na(snpj.dat[,2])),] within.polygon <- inout(snpj.dat.trunc, clust.polygon, bound = TRUE) within.clust <- rep(FALSE,nrow(snpj.dat)) within.clust[(!is.na(snpj.dat[,1]))&(!is.na(snpj.dat[,2]))] <- within.polygon clust.dat[[k]] <- snpj.dat[within.clust == TRUE,] # (1) # use 'apply' to calculate the centroid [mean.X,mean.Y] of # points in 'clust.dat' (within the polygon) # -> function takes mean of each column (MARGIN = 2) # -> then transpose the means from a vector to a 1R x 2C matrix # (2) # problem if single point defines cluster, as the single xy-coord # can be incorrectly read in as a vector, and an average taken over # both values: prevent by calling 'matrix' function to assign X- & # Y-values to separate columns # (3) # 'var' generates variance-covariance matrix for cluster # (4) # since MUST be able to draw 3 clusters (even if no minor homozygote # individual is present), if there are no points within the cluster # 'clust.dat' is assigned values of [0,0] to prevent 'NA' values # being passed to centroid and covar matrix routines if(length(clust.dat[[k]]) == 2) clust.dat[[k]] <- matrix(clust.dat[[k]],ncol=2) if(length(clust.dat[[k]]) == 0) clust.dat[[k]] <- cbind(0,0) clust.mean[[k]] <- t(apply(clust.dat[[k]], MARGIN = 2, FUN = mean)) clust.covar[[k]] <- var(clust.dat[[k]]) } #CLOSE# Loop through clusters # ideally a var-covar matrix is defined separately for each cluster, # BUT clusters of few points will give poor variance estimates # (a) # following loop checks whether clusters are made up of <= 25 # points (NOTE: arbitrary value) # -> if yes, var-covar matrix for this small cluster is replaced # by matrix from the larger of the other two clusters # -> if no, var-covar matrix for a given cluster is defined by the # points within that cluster # (b) # loop works using the indexed list files, _only_ due to 'clust.seq' # (= 1 2 3 1 2 3), as then the k, [k+1], & [k+2] values index this # object, and values > 3 changed to values <= 3 # -> without this, k = 1 is OK, but get errors with k = 2 or 3 # (when k = 3, [k+1] = 4, and [k+2] = 5, but k can only take # values from 1 to 3) clust.seq <- c(1:num.clust, 1:num.clust) #OPEN# Loop through clusters for(k in 1:num.clust) { if(nrow(clust.dat[[clust.seq[k]]]) <= 25) { if(nrow(clust.dat[[clust.seq[k+1]]]) > nrow(clust.dat[[clust.seq[k+2]]])) { clust.covar[[clust.seq[k]]] <- clust.covar[[clust.seq[k+1]]] } else clust.covar[[clust.seq[k]]] <- clust.covar[[clust.seq[k+2]]] } } #CLOSE# Loop through clusters #OPEN# Loop through clusters for(k in 1:num.clust) { # 'ellipse' package used to make a 99% CI ellipse around each cluster # centered on 'clust.mean', based on var-covar matrix of points # in cluster ci.ellipse[[k]] <- ellipse(clust.covar[[k]], centre = clust.mean[[k]], level = 0.99, npoints = 100) # (1) # make matrix (length = 'nrow(snpj.dat)') holding 'clust.mean' # in every row, and create 'clust.resids' by subtracting these # means from each row of 'snpj.dat' # (2) # 'solve' returns inverse of the var-covar matrix # (3) # 'det' gives determinant of matrix by QR decomposition (no # difference - in this case - from finding with eigenvalues) # (4) # apply formula to each row (MARGIN = 1) of 'clust.resids' to # generate likelihood that each point is a member of the same # joint bivariate normal pdf clust.resids <- snpj.dat - matrix(rep(clust.mean[[k]], nrow(snpj.dat)), byrow = TRUE, ncol = 2) inv.clust.covar <- solve(clust.covar[[k]]) constant <- sqrt(det(inv.clust.covar))/(2.0 * pi) BVN[[k]] <- apply(clust.resids, MARGIN = 1, function(x) constant*exp(-0.5 * x %*% inv.clust.covar %*% x)) } #CLOSE# Loop through clusters # (1) # 'BVN.all' holds a column for the likelihood of each point being in each # cluster # 'max.like.clust' holds the max likelihood value for each point # 'sum.like.clust' holds the sum of the 3 values for each point # (2) # need vector indicating to which cluster a given point belongs on the # basis of the likelihood values # -> 'clust.member' takes the 3 values for a point, codes according # to column of origin (1, 2, 3), sorts codes in descending # order across the row, and returns column [1] which holds # column code of the highest likelihood # -> NOTE: in the extremely unlikely case of a tie (2 or 3 identical # likelihoods on a row), the value entered for 'clust.member' # is determined by which column the tied values belong to: order # of precedence is col.1 > col.2 > col.3 (i.e. a/b HET > b/b HOMO # > a/a HOMO). Considered so unlikely - given num of sig figs - that # no safeguards are in place BVN.all <- cbind(BVN[[1]], BVN[[2]], BVN[[3]]) max.like.clust <- apply(BVN.all, MARGIN = 1,FUN = max) sum.like.clust <- apply(BVN.all, MARGIN = 1,FUN = sum) clust.member <- apply(BVN.all, MARGIN = 1,function(x) order(-x)[1]) # must ensure likelihood of being in ANY cluster is above some # threshold (3 low values suggest point cannot be called), and that # the max likelihood for a point is sufficiently high (to have some # confidence it is in a given cluster) # -> 'clust.indic' returns TRUE provided the max likelihood is at # least 95% of the sum of the likelihoods, AND is above 0.0005 # -> TRUE/FALSE then multiplied by the value (1:3) in 'clust.member', # so points that do not satisfy these criteria become 0 (and # changed to 5 for plotting purposes) clust.indic <- (max.like.clust/sum.like.clust > 0.95 & max.like.clust > 0.0005) * clust.member clust.indic <- clust.indic + (1-(max.like.clust/sum.like.clust > 0.95 & max.like.clust > 0.0005)) * 5 # plot ln(SNP) data and colour-code by classification: 1 = a/b HET # (black), 2 = b/b HOMO (red), 3 = a/a HOMO (green3), 5 = no call (cyan) # -> axis limits set to be 5% lower than the min value of X(Y) to # 5% higher than the max value of X(Y) # -> 'dev.set(dev.prev())' ensures plots sent to file rather than screen dev.set(dev.prev()) plot(snpj.dat, xlab = "probe a", ylab = "probe b", col = clust.indic, main = snpj.name, pch = ".", xlim = c((min(snpj.dat[,1], na.rm = TRUE)*0.95), (max(snpj.dat[,1], na.rm = TRUE)*1.05)), ylim = c((min(snpj.dat[,2], na.rm = TRUE)*0.95), (max(snpj.dat[,2], na.rm = TRUE)*1.05))) # comment out following 3 lines to eliminate 99% confidence ellipses # (blue [4]) around each cluster in output plots lines(ci.ellipse[[1]], lty = 5, lwd = 0.5, col = 4) lines(ci.ellipse[[2]], lty = 5, lwd = 0.5, col = 4) lines(ci.ellipse[[3]], lty = 5, lwd = 0.5, col = 4) dev.set(dev.prev()) # concatenate the genotype calls ('clust.indic') with the confidence # level of the genotypes (max.like.clust/sum.like.clust), and name the # columns using 'snpj.name' # -> for 1st SNP (j == 1) want to attach this 'calls' data to # 'genotype.dat' (initially holds just the indicator cols from # 'intensity.dat'); resultant object = 'calls.full' # -> for each subsequent SNP (j != 1), want to attach the 'calls' data # to 'calls.full' calls <- cbind(clust.indic) colnames(calls) <- paste("Call", snpj.name, sep = ".") conf <- cbind(max.like.clust/sum.like.clust) colnames(conf) <- paste("Conf",snpj.name,sep=".") if(j == 1) { calls.full <- cbind(genotype.dat, calls) conf.full <- cbind(confidence.dat, conf) } if(j != 1) { calls.full <- cbind(calls.full, calls) conf.full <- cbind(conf.full, conf) } } #CLOSE# Finished analysis of the good data #OPEN# When data is bad if(good.data == 0) { # an uncalled/bad SNP produces a col of '6's in genotype table # and a col of '0's in confidence table calls <- cbind(rep(6, nrow(snpj.dat))) colnames(calls) <- paste("Call", snpj.name, sep = ".") conf <- cbind(rep(0, nrow(snpj.dat))) colnames(conf) <- paste("Conf",snpj.name,sep=".") if(j == 1) { calls.full <- cbind(genotype.dat, calls) conf.full <- cbind(confidence.dat, conf) } if(j != 1) { calls.full <- cbind(calls.full, calls) conf.full <- cbind(conf.full, conf) } } #CLOSE# Finished analysis of the bad data #OPEN# If don't want to continue analysis if(good.data == 99) { # writes the data collected so far to file, but as yet there is # no way to restart calling at an arbitrary SNP in the input file write.table(calls.full, file = genotype.out, col.names = TRUE, row.names = FALSE, quote = FALSE, sep = " ") write.table(conf.full, file = confidence.out, col.names = TRUE, row.names = FALSE, quote = FALSE, sep = " ") graphics.off() stop("Exit program early") } #CLOSE# } #CLOSE# Loop through SNPs write.table(calls.full, file = genotype.out, col.names = TRUE, row.names = FALSE, quote = FALSE, sep = " ") write.table(conf.full, file = confidence.out, col.names = TRUE, row.names = FALSE, quote = FALSE, sep = " ") graphics.off() }