# HELLO PPI WORLD!

# Disclaimer
# The PINOT webportal and the PINOT scripts are offered for free and only for private, non-commercial use.
# The authors do not guarantee or warrant that the information and material (Information) is up-to-date, complete, accurate, free of defects, errors and/or viruses.
# If you choose to rely on any of the Information, you do so at your own risk.
# We reserve the right to interrupt the availability of and/or cease to publish the PINOT webportal and/or scripts for any reason and without notice.
# To the extent permitted by law, the authors and the University of Reading do not accept any liability for any damage or loss arising in relation to your use of the webportal and/or scripts and/or the Information.
# PINOT links to external websites and resources that are independent and not under the control of the authors.
# The authors have no control over and are not liable for the nature, content and availability of those sites.

################################################################################
# 1 - download the data from the IMEX consortium using API provided in the PSICQUIC library
################################################################################

# install and load the PSICQUIC library from Bioconductor
# PSICQUIC is a project within the HUPO Proteomics Standard Initiative (HUPO-PSI). It standardises programmatic access to molecular interaction databases.
# Author: Paul Shannon
# Maintainer: Paul Shannon
# Citation: Shannon P (2018). PSICQUIC: Proteomics Standard Initiative Common QUery InterfaCe. R package version 1.18.0.
# From within R, enter citation("PSICQUIC")
library(PSICQUIC)

# change the current working directory (NOTE: all files/folders should be placed in the cwd)
# create 2 folders in the cwd named "Downloads" and "LogFiles"
cwd <- "ADD HERE CWD"
setwd(cwd)

# step 5 settings (see the description of step 5 below)
# REASSIGNMENT.MODE selects which of the two alternative step-5 procedures is run: "automated" or "semi-automated"
REASSIGNMENT.MODE <- "automated"
# decide if FILTER should be "lenient" or "stringent"
# NOTE(review): "lenient" is only a placeholder default - confirm the intended value before running
FILTER <- "lenient"
# fail now rather than after the (slow) download if a setting is mistyped
stopifnot(
  REASSIGNMENT.MODE %in% c("automated", "semi-automated"),
  FILTER %in% c("lenient", "stringent")
)

# --- helpers shared by the steps below ---

# returns "YES" when the given sub-folder of the cwd contains at least one file, "NO" otherwise
flag.for <- function(folder) {
  if (length(list.files(paste0(cwd, "/", folder))) >= 1) "YES" else "NO"
}

# reads every tab-separated file (with header) in `paths` and stacks them by rows;
# returns an empty character vector when `paths` is empty
bind.tables <- function(paths) {
  tables <- lapply(paths, read.table, sep = "\t", header = TRUE)
  if (length(tables) == 0) {
    return(character())
  }
  do.call(rbind, tables)
}

# keeps, for every entry, only the text that precedes the first ".F"
seed.names <- function(entries) {
  pieces <- strsplit(entries, ".F", fixed = TRUE)
  vapply(pieces, function(piece) piece[1], character(1))
}

# load the Protein list from the cwd. This should be a txt file, single column, no header. Every protein should be reported with their HUGO name.
file.name <- "ADD HERE FILE NAME.TXT"
Plist <- read.table(file.name, header = FALSE, sep = "\t", stringsAsFactors = FALSE)
Plist <- as.vector(Plist[, 1])

# the default repositories used for download are: "BioGrid", "bhf-ucl", "InnateDB", "IntAct", "MINT", "UniProt", "MBInfo"
# if one of these is off-line it will be skipped and a final list of repositories will be generated for the user
pvpv <- c("BioGrid", "bhf-ucl", "IntAct", "MINT", "UniProt", "MBInfo", "InnateDB")
psicquic <- PSICQUIC()
prov <- providers(psicquic)
online <- pvpv[pvpv %in% prov]
write.table(online, "available.providers.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)

# the same PSICQUIC object is reused for every protein instead of being rebuilt at each iteration
has.data <- logical(length(Plist))
for (i in seq_along(Plist)) {
  ID <- Plist[i]
  # the default downloaded species is human - 9606
  res <- interactions(psicquic, id = ID, species = "9606", provider = online)
  # this checks that data has been downloaded - identifies empty files - write the NON empty files in the "Downloads" folder
  has.data[i] <- nrow(res) >= 1
  if (has.data[i]) {
    nameres <- paste0(cwd, "/Downloads/", ID, ".txt")
    write.table(res, nameres, quote = FALSE, sep = "\t", row.names = FALSE)
  }
}
nores <- Plist[!has.data]

# the protein names that are not recognized or that don't have any partner reported (empty files) are written in a report and saved in the LogFiles folder
namenores <- paste0(cwd, "/LogFiles/", "IDS.EXCLUDED.NO-data.txt")
write.table(nores, namenores, quote = FALSE, sep = "\t", row.names = FALSE)

############
# FLAG to check there are files for the next steps (i.e. not all the seeds have been dropped)
FLAG <- flag.for("Downloads")

################################################################################
################################################################################
# 2- run the script to convert the nomenclature to UniprotID + GeneID. Entries that are chemicals or that are just annotated to TrEMBL IDs will be converted to NA
################################################################################

# create 1 folder in the cwd named "Converted-Name"
# this line moves to the script folder and loads the conversion.name script
if (FLAG == "YES") {
  wd <- paste0(cwd, "/Network-Scripts")
  setwd(wd)
  source("conversion.name.r")

  # this reads all the files that have been downloaded and passes them to the conversion.name script
  # (the working directory is moved because the script receives bare file names)
  setwd(paste0(cwd, "/Downloads"))
  for (file.name in list.files()) {
    seed.data <- read.csv(file.name, header = TRUE, sep = "\t", stringsAsFactors = FALSE, quote = "")
    conversion.name(seed.data, file.name, wd, cwd)
  }

  ############
  # FLAG to check there are files for the next steps (i.e. not all the seeds have been dropped)
  FLAG <- flag.for("Converted-Name")
}

################################################################################
################################################################################
# 3- run the script to modify the first line of each downloaded/ID-converted file so that in position 1:1 is the nameID of the seed of interest (as reported in the file name)
# sometimes it happens that the ID used as seed name at the download is not the official name of the seed protein ! - in this case the file will be discarded and written in the LogFiles folder with the extension "SEED-NAME-notFOUND.txt"
# suggestion - check the LogFiles folder and the files that have been discarded for suggestions on the official name that may be used instead
################################################################################

# create 1 folder in the cwd named "First-Position"
# this moves to the script folder and loads the "First-Position" script
if (FLAG == "YES") {
  wd <- paste0(cwd, "/Network-Scripts")
  setwd(wd)
  source("FirstPosition.r")

  # this reads all the files that have been downloaded/ID-converted and passes them to the FirstPosition script
  setwd(paste0(cwd, "/Converted-Name"))
  for (file.name in list.files()) {
    seed.data <- read.csv(file.name, header = TRUE, stringsAsFactors = FALSE, sep = "\t")
    FirstPosition(seed.data, file.name, wd, cwd)
  }

  ############
  # FLAG to check there are files for the next steps (i.e. not all the seeds have been dropped)
  FLAG <- flag.for("First-Position")
}

################################################################################
################################################################################
# 4- run the script to parse the file and performs QC.
# i) All the entries will be parsed so that the seed protein is placed as "Interactor A"
# ii) All the entries where the interactor name (A or B) has not been converted (NA) will be removed
# iii) All the entries with more PMIDs or NO PMID or PMID=0 or PMID=unassigned will be removed
# iv) All files that do not survive QC (i.e. all entries have multiple or missing PMID) will be written in the LogFiles with the extension "QCed-removed.txt"
################################################################################

# create 1 folder in the cwd named "Parsed-QCed"
# this moves to the script folder and loads the "Parse" script
if (FLAG == "YES") {
  wd <- paste0(cwd, "/Network-Scripts")
  setwd(wd)
  source("Parse.r")

  # this reads all the files that have been downloaded/ID-converted/First-Position-checked and passes them to the Parse/QC script
  setwd(paste0(cwd, "/First-Position"))
  for (file.name in list.files()) {
    Parse(file.name, wd, cwd)
  }

  ############
  # FLAG to check there are files for the next steps (i.e. not all the seeds have been dropped)
  FLAG <- flag.for("Parsed-QCed")
}

################################################################################
################################################################################
# 5- run the script to reassign the Interaction Detection Method
# Interaction detection method is re-assigned based on grouping of methods in classes of similarity
# Some entries are removed if the interaction detection method field is not specified
# The file is removed if ALL the entries have an interaction detection method that is not specified and written in the LogFiles folder

# 5- AUTOMATED (REASSIGNMENT.MODE <- "automated")
# Note the procedure can be carried on automatically - Reassignment.Method.AUTOMATIC.r - in this case:
# i) the entries that are associated with an interaction method that is not recognized will be lost

# 5- SEMI-AUTOMATED (REASSIGNMENT.MODE <- "semi-automated")
# Note the procedure can be carried on semi-automatically - Reassignment.Method.r - in this case:
# i) the entries that are associated with an interaction method that is not recognized will be flagged-up and should be corrected manually before moving to step 6
################################################################################

# create 2 folders in the cwd named "Method-Reassigned" and "temp"
# this moves to the script folder and loads the Method Reassignment script selected by REASSIGNMENT.MODE
# (the two procedures are alternatives, so only one of them is run)
if (FLAG == "YES") {
  automated <- REASSIGNMENT.MODE == "automated"

  wd <- paste0(cwd, "/Network-Scripts")
  setwd(wd)
  if (automated) {
    source("Reassignment.Method.AUTOMATIC.FILTER.R")
  } else {
    source("Reassignment.Method.FILTER.R")
  }

  # this reads all the files that have been downloaded/ID-converted/First-Position-checked/Parsed&QCed and passes them to the Method Reassignment script
  setwd(paste0(cwd, "/Parsed-QCed"))
  for (file.name in list.files()) {
    if (automated) {
      Reassignment.Method.AUTOMATIC.FILTER(file.name, wd, cwd, FILTER)
    } else {
      # NOTE(review): the automated function is named without the ".R" suffix of its file;
      # confirm that Reassignment.Method.FILTER.R really is the function name defined by the sourced script
      Reassignment.Method.FILTER.R(file.name, wd, cwd, FILTER)
    }
  }

  # this produces a txt file (Summary_nonCONVERTED_Methods.txt in the temp folder) in which interaction methods that have not been recognized/converted or that have not passed QC (unassigned methods) are counted
  # automated: this is just to collect a record regarding the situation in case it is needed
  # semi-automated: this file is important because now it can be checked to see if there are unrecognised/unconverted methods to be converted manually
  # to correct manually the unrecognised/unconverted methods open the txt file and search for NULL - then manually convert the method
  temp.dir <- paste0(cwd, "/temp")
  summary.name <- "Summary_nonCONVERTED_Methods.txt"
  temp.files <- list.files(temp.dir, all.files = FALSE)
  # a summary left behind by a previous run must not be counted as an input
  temp.files <- temp.files[temp.files != summary.name]
  count <- bind.tables(paste0(temp.dir, "/", temp.files, recycle0 = TRUE))
  write.table(count, paste0(temp.dir, "/", summary.name), quote = FALSE, sep = "\t", row.names = FALSE)

  ############
  # FLAG to check there are files for the next steps (i.e. not all the seeds have been dropped)
  FLAG <- flag.for("Method-Reassigned")
}

################################################################################
################################################################################
# 6- run the script to i) remove interactions that have been annotated and duplicated in different databases; ii) score the interactions based on publications and interaction detection methods
################################################################################

# create 1 folder in the cwd named "Scored"
# this moves to the script folder and loads the "score" script
if (FLAG == "YES") {
  wd <- paste0(cwd, "/Network-Scripts")
  setwd(wd)
  source("score.r")

  # this reads all the files that have been downloaded/ID-converted/First-Position-checked/Parsed&QCed/MethodReassigned and passes them to the scoring script
  setwd(paste0(cwd, "/Method-Reassigned"))
  for (file.name in list.files()) {
    score(file.name, wd, cwd)
  }

  ############
  # FLAG to check there are scored files to combine (i.e. not all the seeds have been dropped)
  FLAG <- flag.for("Scored")
}

################################################################################
################################################################################
# 7- combine the results in a network file
################################################################################
if (FLAG == "YES") {
  scored.files <- list.files(paste0(cwd, "/Scored"), all.files = FALSE, full.names = TRUE)
  network <- bind.tables(scored.files)
  write.table(network, paste0(cwd, "/final_network.txt"), quote = FALSE, sep = "\t", row.names = FALSE)

  # this summarizes all the IDs of the excluded seeds and reports them in the log file
  log.dir <- paste0(cwd, "/LogFiles")
  no.data.name <- "IDS.EXCLUDED.NO-data.txt"
  log.files <- list.files(log.dir, all.files = FALSE)
  IDexcluded <- read.table(paste0(log.dir, "/", no.data.name), header = TRUE, stringsAsFactors = FALSE)
  IDexcluded <- as.character(IDexcluded[, 1])

  # dropped seeds = seeds without data + one entry per file written to LogFiles by the QC steps
  # (the no-data report itself is not a dropped seed)
  proteins_excluded <- c(IDexcluded, log.files[log.files != no.data.name])
  if (length(proteins_excluded) == 0) {
    res <- "NO proteins removed"
  } else {
    res <- seed.names(proteins_excluded)
  }

  res <- c("proteins_dropped", res)
  write.table(res, paste0(cwd, "/log.txt"), quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)

  # The final_network file contains all the info to create the network graph.
  # The score can be filtered at >2 to keep only interactions that have been validated (repeated at least 1 time with different methods or different publications)
  # NOTE: if the paper is annotated in 2 different repositories with different interaction detection methods this will count as replication because there is no way to discriminate if this is a false or a positive annotation
}

if (FLAG == "NO") {
  res <- c("proteins_dropped", "ALL proteins were removed no network can be created")
  write.table(res, paste0(cwd, "/log.txt"), quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
}

# leave the session in the configured working directory rather than in one of the step folders
setwd(cwd)

################################################################################
# JOB DONE!