## Header section ## ---- rm(list=(ls())) setwd("~/Downloads/CURRENT/") library("ggplot2") Files<-dir(pattern='*.txt') ## Funtions ## ---- Tags <- function(data, flip=F){ TagCount <- sum(grepl(".corrected",colnames(data))) # Determines number of reporters targetList <- c("P02769","P02666","P00004", "P68082","P00761") Protein <- rep(targetList,each=TagCount) Dilution <- c() ## Sets type, iTRAQ or TMT ifelse(TagCount == 8, c(TagType <- "iTRAQ", TagLabels <- rep(c(113:119,121),5), Values <- c(2,1,0.8,3,3,0.8,1,2, 0.8,0.8,1,1,2,2,3,3, 1,2,3,0.8,0.8,3,2,1, 3,3,2,2,1,1,0.8,0.8, 1.7,1.7,1.7,1.7,1.7,1.7,1.7,1.7), ifelse(flip,Dilution <- rep(c(1,1,0.5,0.5,0.2,0.2,0.1,0.1),5),Dilution <- rep(rev(c(1,1,0.5,0.5,0.2,0.2,0.1,0.1)),5)) ), ifelse(TagCount == 6, c(TagType <- "TMT", TagLabels <- rep(126:131,5), Values <- c(2,2,3,3,0.8,0.8, 11.6/6,11.6/6,11.6/6,11.6/6,11.6/6,11.6/6, 0.8,0.8,2,2,3,3, 3,3,0.8,0.8,2,2, 11.6/6,11.6/6,11.6/6,11.6/6,11.6/6,11.6/6), ifelse(flip,Dilution <- rep(c(0.1,0.5,1,0.1,0.5,1), 5),Dilution <- rep(rev(c(0.1,0.5,1,0.1,0.5,1)), 5)) ), stop("Problem with isobaric tag headers") ) ) ## Reporter indicies in data CorrectedLabelIndex <- rep(grep(".corrected",colnames(data)),5) RawLabelIndex <- CorrectedLabelIndex + TagCount ## Outputs a dataframe with all key information output <- data.frame( TagType, TagCount, TagLabels, CorrectedLabelIndex, RawLabelIndex, Protein, Values, Dilution, stringsAsFactors = F ) return(output) } Corrected.new <- function(prot, data, raw=NULL){ index <- grep(prot, data$Proteins) ifelse( is.null(raw), dat <- data[index,unique(Tags(data)$CorrectedLabelIndex)], dat <- data[index,unique(Tags(data)$RawLabelIndex)] ) output <- dat/rowSums(dat) return(output) } Trim <- function(name){ dat <- sub(".evidence.txt","",name) dat <- strsplit(dat, "_") dat <- data.frame(dat, stringsAsFactors = F) output <- dat } FlattenData <- function(data, targetList, flip=F, raw=NULL, dil=F){ EXT <- c() # Establish the EXT (extender) variable attach(Tags(data,flip)) # attach the tags data for(prot in targetList){ # for each protein, prot dat <- stack(Corrected.new(prot, data, raw)) # dat is the stacked form of Corrected.new, making it values and indices (labels) ifelse(dil, # generate tag data, if the data is diltured, multiply the values by the dilution factor tagDat <- (Tags(data, flip)[Protein == prot,]$Values * Tags(data, flip)[Protein == prot,]$Dilution), tagDat <- Tags(data, flip)[Protein == prot,]$Values ) # Change levels of factor to expected values levels(dat$ind) <- tagDat/sum(tagDat) # Collect reciprocal, for scaling data to readable values BASE_SCALE = 1/(min(tagDat/sum(tagDat))) # Convert levels of factor to numeric values dat$ind <- as.numeric(levels(dat$ind))[dat$ind] ## # Here I need to apply the conversion back up into rational numbers so readers can understand dat <- dat*BASE_SCALE ## # Binds the protein name, so individual proteins can be pulled out and identified dat <- cbind(dat,prot) # Extends the list of output data EXT <- rbind.data.frame(EXT,dat) } detach() output <- EXT } RemoveZeros <- function(data, flip=F, raw=NULL){ ifelse( is.null(raw), dat <- data[,unique(Tags(data, flip=F)$CorrectedLabelIndex)], dat <- data[,unique(Tags(data)$RawLabelIndex)] ) # dat <- data[,unique(Tags(data)$CorrectedLabelIndex)] dat[dat == 0]<- NA index <- complete.cases(dat) output <- data[index,] } PeptideQuants <- function(){ mergeProt <- paste(targetList, collapse = "|") Files<-dir(pattern='*.txt') tmpChain <- c() for(FILE in Files){ data <- read.table(FILE, sep = "\t", stringsAsFactors = F, header = T) allPepts <- nrow(data) quanPept <- nrow(RemoveZeros(data)) targetPepts <- nrow(Corrected.new(mergeProt, data)) quanTarget <- nrow(Corrected.new(mergeProt, RemoveZeros(data))) prots <- length(unique(data$Leading.razor.protein)) fractions <- max(data$Fraction) tmpChain <- rbind(tmpChain, c(allPepts,quanPept,targetPepts, quanTarget, prots, fractions)) } row.names(tmpChain) <- Files colnames(tmpChain) <- c("Identified MSMS spectra", "Quantified MSMS spectra", "Identified target Spectra", "Quantified target Spectra", "Proteins", "No. of Fractions") output <- as.data.frame(tmpChain) } ScalingMatrix <- function(data, flip=F, dil=F){ data <- RemoveZeros(data) # Strip zeros from the dataset scalingMatrix <- c() # establish the scalingMatrix variable dat <- Tags(data, flip) # simplify calls by creating the dat variable, containing the tag information for(PROT in targetList){ # For each protein, PROT ifelse(dil, # generate tag data, if the data is diltured, multiply the values by the dilution factor tagDat <- (dat[dat$Protein == PROT,]$Values * dat[dat$Protein == PROT,]$Dilution), tagDat <- dat[dat$Protein == PROT,]$Values ) for(LABEL in seq(length(unique(Tags(data)$TagLabels)))){ # and for each label, LABEL # target is the concentration of PROT for each label/sum(PROT for each label), indexed by label target <- (tagDat/sum(tagDat))[LABEL] # The scalingMatrix builds sequentially, adding one label at a time # It takes the median value for the observed data # !!!which needs to be corrected!!! ## !!! Pull out the Corrected.new part, put it into a separate variable, apply the dilution, use the variable instead! # Lists the label being referenced # and the protein, # then gives the target value # and then the target divided by the median, # to centralise the median of the data distribution in the observed data around the target point (correction) scalingMatrix <- rbind(scalingMatrix, c(median(Corrected.new(PROT, data)[,LABEL]), unique(dat$TagLabels)[LABEL], PROT, unique(dat$TagType) , target , target/median(Corrected.new(PROT, data)[,LABEL]) ) ) } # Fix formatting into a dataframe, establish names, and make data numeric instead of strings (formatting issue) scalingMatrix <- as.data.frame(scalingMatrix, stringsAsFactors = F) names(scalingMatrix) <- c("Median", "Label", "Protein", "Tag", "TargetValue", "ScalingFactor") scalingMatrix$ScalingFactor <- as.numeric(scalingMatrix$ScalingFactor) scalingMatrix$Label <- as.numeric(scalingMatrix$Label) } output <- scalingMatrix } ScaleData <- function(data, targetList, scale_Data=NULL, flip=F, raw=NULL, dil=F){ EXT <- c() attach(Tags(data,flip)) for(prot in targetList){ dat <- stack(Corrected.new(prot, data, raw)) # Checks to see if the sample needs to be diluted, if so multiplies values by dilution ifelse(dil, tagDat <- (Tags(data, flip)[Tags(data,flip)$Protein == prot,]$Values * Tags(data, flip)[Tags(data,flip)$Protein == prot,]$Dilution), tagDat <- Tags(data, flip)[Tags(data,flip)$Protein == prot,]$Values ) # Generate a scaling matrix - checks to see if a separate dataset has been provided to scale against then applies correction if(is.null(scale_Data)) scale_Data <- data; NORMAL_SCALE <- ScalingMatrix(scale_Data)[ScalingMatrix(scale_Data)$Protein==prot,]$ScalingFactor SCALING_VECTOR <- dat$ind levels(SCALING_VECTOR) <- NORMAL_SCALE SCALING_VECTOR <- as.numeric(levels(SCALING_VECTOR))[SCALING_VECTOR] dat$values <- dat$values * SCALING_VECTOR # Change levels of factor to expected values levels(dat$ind) <- tagDat/sum(tagDat) # Collect reciprocal, for scaling data to readable values BASE_SCALE = 1/(min(tagDat/sum(tagDat))) # Convert levels of factor to numeric values dat$ind <- as.numeric(levels(dat$ind))[dat$ind] ## # Here I need to apply the conversion back up into rational numbers so readers can understand dat <- dat*BASE_SCALE ## # Binds the protein name, so individual proteins can be pulled out and identified dat <- cbind(dat,prot, SCALING_VECTOR) # Extends the list of output data EXT <- rbind.data.frame(EXT,dat) } detach() output <- EXT } ## Protein Index ## ---- targetList <- c("P02769","P02666","P00004", "P68082") protNames <- c("BSA", "B Casein", "Cytochrome C", "Myoglobin") protTable <- new.env() for(i in seq(length(targetList))){ protTable[[ targetList[i] ]] <- protNames[i] } write.csv(PeptideQuants(),"TableOfQuants.csv") pdf("test.4.pdf") for(File in Files){ data <- read.table(File, sep = "\t", stringsAsFactors = F, header = T) p <- ggplot(FlattenData(data, targetList, flip=as.numeric(Trim(File)[3,1]), dil = as.numeric(Trim(File)[2,1])), aes(x=ind, y=values, colour=prot), na.rm=T) + geom_point(shape=1,position=position_jitter(width=0.005), alpha = 0.2, na.rm=T) + geom_abline(intercept=0, slope=1) + stat_smooth(method = "glm", formula = y~x, na.rm=T) + stat_smooth(mapping = aes(ind, values, colour = "General Trend"), method = "glm", color="black", linetype=3, na.rm=T) + xlab("Expected ratio") + ylab("Observed ratio") + ggtitle(Trim(File)[1,1]) + ylim(c(0,60)) + xlim(c(0,40)) # geom_boxplot() print(p) } dev.off() pdf("test.5.pdf") for(File in Files){ data <- read.table(File, sep = "\t", stringsAsFactors = F, header = T) data <- RemoveZeros(data) p <- ggplot(FlattenData(data, targetList, flip=as.numeric(Trim(File)[3,1]), dil = as.numeric(Trim(File)[2,1])), aes(x=ind, y=values, colour=prot), na.rm=T) + geom_point(shape=1,position=position_jitter(width=0.005), alpha = 0.2, na.rm=T) + geom_abline(intercept=0, slope=1) + stat_smooth(method = "glm", formula = y~x, na.rm=T) + stat_smooth(mapping = aes(ind, values, colour = "General Trend"), method = "glm", color="black", linetype=3, na.rm=T) + xlab("Expected ratio") + ylab("Observed ratio") + ggtitle(Trim(File)[1,1]) + ylim(c(0,60)) + xlim(c(0,40)) # geom_boxplot() print(p) } dev.off() pdf("test.6.pdf") for(File in Files){ data <- read.table(File, sep = "\t", stringsAsFactors = F, header = T) data <- RemoveZeros(data) p <- ggplot(FlattenData(data, targetList, flip=as.numeric(Trim(File)[3,1]), dil = as.numeric(Trim(File)[2,1])), aes(x=ind, y=values), na.rm=T) + geom_point(shape=1,position=position_jitter(width=0.005), alpha = 0.2, na.rm=T) + geom_abline(intercept=0, slope=1) + stat_smooth(method = "glm", formula = y~x, na.rm=T) + # stat_smooth(mapping = aes(ind, values), method = "glm" linetype=3, na.rm=T) + xlab("Expected ratio") + ylab("Observed ratio") + ggtitle(Trim(File)[1,1]) + ylim(c(0,60)) + xlim(c(0,40)) # geom_boxplot() print(p) } dev.off() pdf("test.7.pdf") # scale_data needs to be assigned, look at filename switches and determine the logical pattern to isolate the files you want. scale_stat <- -1 for(File in Files){ if(!scale_stat == Trim(File)[4,]) { scale_Data <- RemoveZeros(read.table(File, sep = "\t", stringsAsFactors = F, header = T)); scale_stat <- Trim(File)[4,]} data <- RemoveZeros(read.table(File, sep = "\t", stringsAsFactors = F, header = T)) p <- ggplot(ScaleData(data, targetList, scale_Data, flip=as.numeric(Trim(File)[3,1]), dil = as.numeric(Trim(File)[2,1])), aes(x=ind, y=values, colour=prot), na.rm=T) + geom_point(shape=1,position=position_jitter(width=0.005), alpha = 0.2, na.rm=T) + geom_abline(intercept=0, slope=1) + stat_smooth(method = "glm", formula = y~x, na.rm=T) + stat_smooth(mapping = aes(ind, values, colour = "General Trend"), method = "glm", color="black", linetype=3, na.rm=T) + xlab("Expected ratio") + ylab("Observed ratio") + ggtitle(Trim(File)[1,1]) + ylim(c(0,60)) + xlim(c(0,40)) # geom_boxplot() print(p) } dev.off() # scale_data needs to be assigned, look at filename switches and determine the logical pattern to isolate the files you want. scale_stat <- -1 # This is a repeat of test.7.pdf, but the plots are generated as individually named files to aid figure creation. for(File in Files){ pdf(paste(Trim(File)[1,], ".pdf")) if(!scale_stat == Trim(File)[4,]) { scale_Data <- RemoveZeros(read.table(File, sep = "\t", stringsAsFactors = F, header = T)); scale_stat <- Trim(File)[4,]} data <- RemoveZeros(read.table(File, sep = "\t", stringsAsFactors = F, header = T)) p <- ggplot(ScaleData(data, targetList, scale_Data, flip=as.numeric(Trim(File)[3,1]), dil = as.numeric(Trim(File)[2,1])), aes(x=ind, y=values, colour=prot), na.rm=T) + geom_point(shape=1,position=position_jitter(width=0.005), alpha = 0.2, na.rm=T) + geom_abline(intercept=0, slope=1) + stat_smooth(method = "glm", formula = y~x, na.rm=T) + stat_smooth(mapping = aes(ind, values, colour = "General Trend"), method = "glm", color="black", linetype=3, na.rm=T) + xlab("Expected ratio") + ylab("Observed ratio") + ggtitle(Trim(File)[1,1]) + ylim(c(0,60)) + xlim(c(0,40)) # geom_boxplot() print(p) dev.off() } # scale_data needs to be assigned, look at filename switches and determine the logical pattern to isolate the files you want. scale_stat <- -1 # As above, but with log-log scales for(File in Files){ pdf(paste(Trim(File)[1,], "-log.pdf")) if(!scale_stat == Trim(File)[4,]) { scale_Data <- RemoveZeros(read.table(File, sep = "\t", stringsAsFactors = F, header = T)); scale_stat <- Trim(File)[4,]} data <- RemoveZeros(read.table(File, sep = "\t", stringsAsFactors = F, header = T)) p <- ggplot(ScaleData(data, targetList, scale_Data, flip=as.numeric(Trim(File)[3,1]), dil = as.numeric(Trim(File)[2,1])), aes(x=ind, y=values, colour=prot), na.rm=T) + geom_point(shape=1,position=position_jitter(width=0.005), alpha = 0.2, na.rm=T) + geom_abline(intercept=0, slope=1) + stat_smooth(method = "glm", formula = y~x, na.rm=T) + stat_smooth(mapping = aes(ind, values, colour = "General Trend"), method = "glm", color="black", linetype=3, na.rm=T) + xlab("Expected ratio") + ylab("Observed ratio") + ggtitle(Trim(File)[1,1]) + scale_y_log10(limits=c(0.1,60)) + scale_x_log10(limits=c(0.99,40)) # geom_boxplot() print(p) dev.off() } scale_stat <- -1 File = Files[8] if(!scale_stat == Trim(File)[4,]) { scale_Data <- RemoveZeros(read.table(File, sep = "\t", stringsAsFactors = F, header = T)); scale_stat <- Trim(File)[4,]} data <- RemoveZeros(read.table(File, sep = "\t", stringsAsFactors = F, header = T)) tmp <- ScaleData(data, targetList, scale_Data, flip=as.numeric(Trim(File)[3,1]), dil = as.numeric(Trim(File)[2,1])) ## Scaling factors and corrections ## need to make a data structure containing: # median value # protein # label # tag type (itraq/tmt) # expected # correction factor ## Useful functions: # RemoveZeros(data) # Corrected.new(targetList[i], RemoveZeros(data))[,j] # median data <- read.table(Files[1], sep = "\t", stringsAsFactors = F, header = T) test <- ScalingMatrix(data) data <- read.table(Files[5], sep = "\t", stringsAsFactors = F, header = T) test2 <- ScalingMatrix(data) tmp <- rbind(test,test2) p <- ggplot(data = tmp, aes(x=Label, y=ScalingFactor, color=Protein)) + geom_point() + stat_smooth(method = "glm", formula = y~x+sin(x)+x^2, na.rm=T) + geom_hline(yintercept=1) print(p) p <- ggplot(data = test, aes(x=Label, y=ScalingFactor, color=Protein)) + geom_point() + stat_smooth(method = "glm", formula = y~sin(x)+cos(x), na.rm=T) + geom_hline(yintercept=1) print(p) p <- ggplot(data = test2, aes(x=Label, y=ScalingFactor, color=Protein)) + geom_point() + stat_smooth(method = "glm", formula = y~sin(x)+cos(x), na.rm=T) + geom_hline(yintercept=1) print(p) MATRIX <- c() for(File in Files[5:8]){ data <- read.table(File, sep = "\t", stringsAsFactors = F, header = T) data <- RemoveZeros(data) ROW <- c() for(i in unique(Tags(data)$CorrectedLabelIndex)){ ROW <- c(ROW, (median(data[,i]))) } MATRIX <- rbind(MATRIX,ROW) } tmp2 <- MATRIX tmp - mean(tmp) output <- c() for(i in 1:8){ output <- cbind(output, Corrected.new(PROT, RemoveZeros(data))[,i]*as.numeric(test$ScalingFactor[test$Protein==PROT][i]))} output <- as.data.frame(output) names(output) <- unique(Tags(data)$TagLabels) p <- ggplot(data = stack(output), aes(x=ind, y=values)) + geom_point(shape=1,position=position_jitter(width=0.005), alpha = 0.2, na.rm=T) + # stat_smooth(method = "glm", formula = y~x, na.rm=T) + # stat_smooth(mapping = aes(ind, values, colour = "General Trend"), method = "glm", color="black", linetype=3, na.rm=T) + # xlab("Expected relative ratio") + ylab("Observed relative ratio") + ggtitle(Trim(File)[1,1]) + # ylim(c(0,0.75)) + xlim(c(0,0.6)) print(p) pdf("test.7.pdf") print(p) dev.off()