setwd('C:/Users/MP/Documents/iGEM_2011/H2/') Dataset <- read.delim(file='Main Data.txt', header = T) ################################################################ Cleaning # This will remove a few lines from the dataset. These lines are # intended to aid the reading of the original file and are # useless in the current working file. The casting as a matrix # is due to the fact that working with a DataFrame, as generated # by the read.delim function, entails some extra information # (eg: levels) that might clash later on. For example, without # the casting the ReactionNames vector ends up with 391 elements # but 395 levels, of which 4 are the removed lines... Dataset <- as.matrix(Dataset[-c(375, 376, 389, 390, 392, 393),]) ################################################################ Reaction Names # This will generate a vector containing the reaction names for # the metabolic reconstruction. They are ordered as they appear # on the Abbreviation column of the Dataset. Notice that since # the statement "length(unique(Dataset[,1])) == # dim(Dataset)[1]" returns TRUE, then we know that each element # on that column is unique and unrepeated. As for the reaction # full names, THOSE are not unique. Therefore we can't clean # them out. ReactionNames <- unique(Dataset[,1]) FullReactionNames <- Dataset[,2] ################################################################ External Data Call # For the Metabolites, we will use a custom PERL script. # Therefore, we output the data as an external file, then call # a custom script to process it, and then read the output of # said custom script. # Note, since the function read.delim determines the column # number from the first 5 lines of the file, and our data has # bigger reactions beyond those 5 lines, we specify the column # name attribute to force the function to have 10 fixed columns. # Visual inspection of the data suggests the longest is a # reaction of 6 elements, therefore the 10 upper limit is a # safe approximation. write(Dataset[,3], file='Equations.txt', sep='\t') system(command='perl From_Equations_to_Reagents_and_Products.pl') reagents_sp <- read.delim(file='Internal_Data/REAGENTS_depuration_5_species.txt', header = F, col.names=c(1:10)) reagents_st <- read.delim(file='Internal_Data/REAGENTS_depuration_5_stochiometry.txt', header = F, col.names=c(1:10)) products_sp <- read.delim(file='Internal_Data/PRODUCTS_depuration_5_species.txt', header = F, col.names=c(1:10)) products_st <- read.delim(file='Internal_Data/PRODUCTS_depuration_5_stochiometry.txt', header = F, col.names=c(1:10)) ################################################################ Metabolites # Having the data on the appropriate tables, we now turn to # obtaining the list of unique Metabolites. We first merge all # known Metabolites into a single large vector, then eliminate # the redundancies. We also eliminate de NA and the "" elements. Metabolites <- vector() Metabolites <- c(as.matrix(reagents_sp), as.matrix(products_sp)) Metabolites <- unique(Metabolites) Metabolites <- Metabolites[!is.na(Metabolites)] Metabolites <- Metabolites[!as.character(Metabolites) == as.character("")] ################################################################ The Matrix # Having the vector with the reaction names, as well as the # vector with the Metabolites, we can now proceed to the # construction of, THE MATRIX called S. # We first add the data from the reagents, then from the # products. Remember that reagents are negated since their flux # is negative. Lastly, we substitute the NAs for 0s in # The Matrix. The_Matrix <- matrix(nrow=length(ReactionNames), ncol=length(Metabolites), dimnames=list(ReactionNames, Metabolites)) for (reaction in 1:length(ReactionNames)){ for (agent_no in 1:10){ if (!is.na(reagents_st[reaction,agent_no])){ species <- as.character(reagents_sp[reaction,agent_no]) The_Matrix[ReactionNames[reaction],species] <- as.numeric(-reagents_st[reaction, agent_no]) } } } for (reaction in 1:length(ReactionNames)){ for (agent_no in 1:10){ if (!is.na(products_st[reaction,agent_no])){ species <- as.character(products_sp[reaction,agent_no]) The_Matrix[ReactionNames[reaction],species] <- as.numeric(reagents_st[reaction, agent_no]) } } } for(reaction in 1:length(ReactionNames)){ for (metabolite in 1:length(Metabolites)){ if (is.na(The_Matrix[reaction,metabolite])){ The_Matrix[reaction,metabolite] <- as.numeric(0) } } } ################################################################ SubSystems # Here we obtain the SubSystem information available for certain # reactions. SubSystems <- Dataset[,4] ################################################################ Genes # Here we obtain the set of genes of interest in the system. # They are ordered as they appear on the 5th column of the big # object named Dataset. Genes_Raw <- Dataset[,5] Genes <- vector() Genes <- c(Genes, as.vector(strsplit(Genes_Raw, ", "))) Genes <- unlist(Genes, use.names = F) Genes <- unique(Genes) ################################################################ Gene Regulation Rules # Here we export the column containing the information of gene # regulation. They will be processed by MatLab directly. We also # generate the table for the proteins coded by said genes. GrRules <- Dataset[,5] ################################################################ Output # Having triumphantly parsed that data into a suitable object of # reverence called The_Matrix, we now output all these data into # separate files: one for ReactionNames, one for Metabolites, # and one for The_Matrix. write(ReactionNames, file='ReactionNames.txt') write(FullReactionNames, file='FullReactionNames.txt') write(Metabolites, file='Metabolites.txt') write.csv(The_Matrix, file='The_Matrix.txt') write(SubSystems, file='SubSystems.txt') write(Genes, file='Genes.txt') write(GrRules, file='GrRules.txt') ################################################################ Onward... # Now go see what that big MatLab code file contains!