setwd('C:/Users/MP/Documents/iGEM_2011/H2/')
Dataset <- read.delim(file='Main Data.txt', header = T)

################################################################ Cleaning
# This will remove a few lines from the dataset. These lines are
# intended to aid the reading of the original file and are
# useless in the current working file. The casting as a matrix
# is due to the fact that working with a DataFrame, as generated
# by the read.delim function, entails some extra information
# (eg: levels) that might clash later on. For example, without
# the casting the ReactionNames vector ends up with 391 elements
# but 395 levels, of which 4 are the removed lines...

Dataset <- as.matrix(Dataset[-c(375, 376, 389, 390, 392, 393),])

################################################################ Reaction Names
# This will generate a vector containing the reaction names for
# the metabolic reconstruction. They are ordered as they appear
# on the Abbreviation column of the Dataset. Notice that since
# the statement "length(unique(Dataset[,1])) ==
# dim(Dataset)[1]" returns TRUE, then we know that each element
# on that column is unique and unrepeated. As for the reaction
# full names, THOSE are not unique. Therefore we can't clean
# them out.

ReactionNames <- unique(Dataset[,1])
FullReactionNames <- Dataset[,2]

################################################################ External Data Call
# For the Metabolites, we will use a custom PERL script.
# Therefore, we output the data as an external file, then call
# a custom script to process it, and then read the output of
# said custom script.
# Note, since the function read.delim determines the column
# number from the first 5 lines of the file, and our data has
# bigger reactions beyond those 5 lines, we specify the column
# name attribute to force the function to have 10 fixed columns.
# Visual inspection of the data suggests the longest is a
# reaction of 6 elements, therefore the 10 upper limit is a
# safe approximation.

write(Dataset[,3], file='Equations.txt', sep='\t')
system(command='perl From_Equations_to_Reagents_and_Products.pl')
reagents_sp <- read.delim(file='Internal_Data/REAGENTS_depuration_5_species.txt', header = F, col.names=c(1:10))
reagents_st <- read.delim(file='Internal_Data/REAGENTS_depuration_5_stochiometry.txt', header = F, col.names=c(1:10))
products_sp <- read.delim(file='Internal_Data/PRODUCTS_depuration_5_species.txt', header = F, col.names=c(1:10))
products_st <- read.delim(file='Internal_Data/PRODUCTS_depuration_5_stochiometry.txt', header = F, col.names=c(1:10))

################################################################ Metabolites
# Having the data on the appropriate tables, we now turn to
# obtaining the list of unique Metabolites. We first merge all
# known Metabolites into a single large vector, then eliminate
# the redundancies. We also eliminate de NA and the "" elements.

Metabolites <- vector()
Metabolites <- c(as.matrix(reagents_sp), as.matrix(products_sp))
Metabolites <- unique(Metabolites)
Metabolites <- Metabolites[!is.na(Metabolites)]
Metabolites <- Metabolites[!as.character(Metabolites) == as.character("")]

################################################################ The Matrix
# Having the vector with the reaction names, as well as the
# vector with the Metabolites, we can now proceed to the
# construction of, THE MATRIX called S.
# We first add the data from the reagents, then from the
# products. Remember that reagents are negated since their flux
# is negative. Lastly, we substitute the NAs for 0s in
# The Matrix.

The_Matrix <- matrix(nrow=length(ReactionNames), ncol=length(Metabolites), dimnames=list(ReactionNames, Metabolites))

for (reaction in 1:length(ReactionNames)){
	for (agent_no in 1:10){
		if (!is.na(reagents_st[reaction,agent_no])){
			species <- as.character(reagents_sp[reaction,agent_no])
			The_Matrix[ReactionNames[reaction],species] <- as.numeric(-reagents_st[reaction, agent_no])
		}
	}
}

for (reaction in 1:length(ReactionNames)){
	for (agent_no in 1:10){
		if (!is.na(products_st[reaction,agent_no])){
			species <- as.character(products_sp[reaction,agent_no])
			The_Matrix[ReactionNames[reaction],species] <- as.numeric(reagents_st[reaction, agent_no])
		}
	}
}

for(reaction in 1:length(ReactionNames)){
	for (metabolite in 1:length(Metabolites)){
		if (is.na(The_Matrix[reaction,metabolite])){
			The_Matrix[reaction,metabolite] <- as.numeric(0)
		}
	}
}

################################################################ SubSystems
# Here we obtain the SubSystem information available for certain
# reactions.

SubSystems <- Dataset[,4]

################################################################ Genes
# Here we obtain the set of genes of interest in the system.
# They are ordered as they appear on the 5th column of the big
# object named Dataset.

Genes_Raw <- Dataset[,5]
Genes <- vector()
Genes <- c(Genes, as.vector(strsplit(Genes_Raw, ", ")))
Genes <- unlist(Genes, use.names = F)
Genes <- unique(Genes)

################################################################ Gene Regulation Rules
# Here we export the column containing the information of gene
# regulation. They will be processed by MatLab directly. We also
# generate the table for the proteins coded by said genes.

GrRules <- Dataset[,5]

################################################################ Output
# Having triumphantly parsed that data into a suitable object of
# reverence called The_Matrix, we now output all these data into
# separate files: one for ReactionNames, one for Metabolites,
# and one for The_Matrix.

write(ReactionNames, file='ReactionNames.txt')
write(FullReactionNames, file='FullReactionNames.txt')
write(Metabolites, file='Metabolites.txt')
write.csv(The_Matrix, file='The_Matrix.txt')
write(SubSystems, file='SubSystems.txt')
write(Genes, file='Genes.txt')
write(GrRules, file='GrRules.txt')

################################################################ Onward...
# Now go see what that big MatLab code file contains!