/* This is a program to create genome Sequences and align genome subsequences using Longest Common Subsequence (LCS). The program runs on a Unix platform and can be run by entering the size of the sequence and the number of subsequences form the command prompt. @Author : Sara Nasser @Date Created: January 26, 2006 @Date Modified: February 28, 2006 @Version : 3.2 @Modified by: Sara Nasser (c) This code is created by the bioinformatics research group in the Department of Computer Science at the University of Nevada Reno. */ //System includes #include #include #include #include #include #include //user directives #define display 0 //Display turned on when set to 1 and off when set to 0 //#define NSQ 100 // no of sub sequences /* Function prototypes */ void Subsequence(int, int); void splicer(); void LCS(char *, char*, char *, int, int&); void substring(char* ,char* , int , int ); void mergeSub(char* ,char* ,char* , char * , int); void align(char**, int, char*, ofstream&); void display_html_original(ofstream& , char **, int); void display_html_1(ofstream&, char** ); void display_html_2(ofstream&, char* ,char*, float); void display_html_merged(ofstream&, char *); void display_error(ofstream& , int , int); int SIZE=300; // the size of the genome sequence /***************************************************************** The main functions reads two variables from the command prompt, @return: int, 0 if the programs runs successfully, 1 if it does not @param :argc the number of arguments, argv the arguments as strings ******************************************************************/ int main(int argc, char* argv[]) { char newSeq[SIZE+1], nextSeq[SIZE+1], overlap[SIZE+1], mergedSeq[SIZE+1]; char mainSeq[SIZE+1]; int noSeq; int overlapSize,oldOverSize=0; int type; //int NSQ=100; SIZE=atoi(argv[1]); // get the size of the main sequence from the user int NSQ=atoi(argv[2]); //get the number of subsequences desired //if sequence size is small and number of subsequences are less than two display an error message if(SIZE < 10 || NSQ <2 ) { if(display ==1) { //If user chooses display, a html file is created that contains the results. ofstream fout2; fout2.open("genome.html"); display_error(fout2, SIZE, NSQ); } return 0; } cout< endse) { temp=start; start=endse; endse=temp; } while(start==endse)// regenerate sub sequence if empty { start=rand()%seqSize; endse=rand()%seqSize; if(start > endse) { temp=start; start=endse; endse=temp; } } // cout<<"Start "<0 && type>0) //has a match, creates an empty mergedSubst if not checked, to be used later to define size of overlap { mergeSub(data[i], data[j],overlap, mergedSeq, type); //show only new sequences if((strlen(mergedSeq) > strlen(data[i])) && (strlen(mergedSeq) > strlen(data[j]))) { //pick up the longest merged subsequence if(strlen(mergedSeq) >= strlen(maxSeq)) { memcpy(maxSeq,mergedSeq,strlen(mergedSeq)*sizeof(char)+1); } } } }//for //copy the longest sub sequence memcpy(data[i], maxSeq, strlen(maxSeq)*sizeof(char)+1); i++; }//while */ ////Combine the results into one final sequence j=0; int oldmax; maxlen=0; while((j < nsq) && (maxlen0 && type>0) //has a match, creates an empty mergedSubst if not checked, to be used later to define size of overlap { mergeSub(data[j], data[i],overlap, mergedSeq, type); //show only new sequences if((strlen(mergedSeq) > strlen(data[i])) && (strlen(mergedSeq) > strlen(data[j]))) { //pick up the longest merged subsequence if(strlen(mergedSeq) >= strlen(maxSeq)) { //if(display) //display_html_original(fout2,data, nsq); memcpy(maxSeq,mergedSeq,strlen(mergedSeq)*sizeof(char)+1); if(strlen(mergedSeq) >= maxlen) maxlen=strlen(maxSeq); } }//if }//if }//if }//for //copy the longest sub sequence //If subsequence does not match any of the other merged/unmerged //subsequences we make it null, so that we don't use it for //any further alignment. // if(strlen(maxSeq)==0) // data[j]=NULL; // else data[j]=maxSeq; //cout<<"new res="<= strlen(mRes)) { if(display==1) display_html_original(fout2, data, nsq); memcpy(mRes,data[j],strlen(data[j])*sizeof(char)+1); //cout<<"Max so far"< oldmax)) { j=0; } }//while } /********************************************************************************** Function to put back the subsequences, using LCS -Dynamic programming Conventions used -1 for up, -2 for left and -3 for up and left in the table @return: void @param :x and y are the sub sequences to be compared, z is the overlap sub sequences, style is the type of overlap 2 :contiguous and 1:non-contiguous. type is a return variable 1 if z is a suffix of x prefix of y 2 if z is a suffix of y prefix of x 0 if z is not a suffix or prefix of x,y ***********************************************************************************/ void LCS(char *x, char *y, char *z, int style, int &type) { //variables int i,j, xstart, ystart, xend, yend; int x_l, y_l; x_l=strlen(x); //lenght of string x and y y_l=strlen(y); //if strings are not empty if(x_l!=0 && y_l!=0) { int **c, **b; //allocate space c= new int* [x_l+1]; b= new int* [x_l+1]; for(i=0; i<=x_l; i++) { b[i]=new int[y_l+1]; c[i]=new int[y_l+1]; for(j=0; j<=y_l; j++) b[i][j]=0; } for(i=0; i<=x_l; i++) //initialize the tables b and c { c[i][0]=0; b[i][0]=-1; //-1 indicates up } for(i=0; i<=y_l; i++) //initialize the tables b and c { c[0][i]=0; b[0][i]=-2; //-2 indicates left } //dynamic programming, constructs the table that can be used to reconstruct the LCS for(i=1; i<=x_l; i++) for(j=1; j<=y_l; j++) { if(x[i-1]==y[j-1]) //two characters in the strings match { c[i][j]=c[i-1][j-1]+1; b[i][j]=-3; //indicates up and left } else if(c[i-1][j]>=c[i][j-1]) { c[i][j]=c[i-1][j]; b[i][j]=-1; //up } else { c[i][j]=c[i][j-1]; b[i][j]=-2; //left } } //the largest no is the length of the common string int len=c[x_l][y_l]; int start, stop=0, max, oldstart, oldstop, ctr, len2=0; int jcopy; z[len]='\0'; i=x_l; j=y_l; //back track oldstop=0; stop=0; ctr=0; oldstart=0; yend=0; start=i; jcopy=j; while(i>0 || j>0) { if(b[i][j]==-3) { i--; j--; len--; z[len]=x[i]; //copy the character if(ctr==0) //save the starting position { start=i; jcopy=j; stop=0; ctr=1; } else { stop++; if(i==0) { if(stop>=oldstop) { oldstop=stop; oldstart=start; yend=jcopy; } } } }//if -3 else if(b[i][j]==-1) { i--; if(stop>=oldstop) { oldstop=stop; oldstart=start; yend=jcopy; } ctr=0; }//else if -1 else if(b[i][j]==-2) { j--; if(stop>=oldstop) { oldstop=stop; oldstart=start; yend=jcopy; } ctr=0; }//else if -2 }//while int i1,j1, count=0, oldcount=0; i=oldstart-oldstop+1; j=yend-oldstop+1; oldstart=xstart=i; ystart=j; //to find the LCS from the starting position if(style==2) { while( i<=x_l && j<=y_l &&(b[i][j]==-3)) { i++; j++; count++; } xend=oldstart=i-1; yend=j-1; oldstop=count; } if(oldstop ==0) { z[0]='\0'; type=0; // no match } else if(oldstop>0 && style==2) //if there is a match { xstart=oldstart-oldstop; xend=oldstart; ystart=yend-oldstop; oldstart--; substring(z, x, xstart, oldstart); //we only want it if its more than 1 char //Determine if the common substring is at the ends if((xend ==x_l) && (ystart==0)) //suffix of x prefix of y { type = 1; } else if((xstart==0) && (yend==y_l)) //suffix of y prefix of x { type = 2; } else type = 0; // not a suffix or prefix } //free space for(i=0; i<=x_l; i++) { delete[] b[i]; delete[] c[i]; } delete[] b; delete[] c; }// if x_l !=0 .... } /************************************************************************ Function to merge two subsequences based on an exact match at the edges. @return: void @param : x,y strings to be merged overlap string LCS z result merged string type 1 if overlap is a suffix of x prefix of y 2 if overlap is a suffix of y prefix of x 0 if overlap is not a suffix or prefix of x,y **************************************************************************/ void mergeSub(char* x,char* y,char* overlap, char * z, int type) { int i,j; int len_m=strlen(overlap); int len_x=strlen(x); int len_y=strlen(y); if(type==1) // suffix of x prefix of y { strcpy(z,x); //copy x to new string for(i=len_x, j=len_m; j"; //display_html_original(fout2,data); } /******************************************************* This function creates a html display, This is one of functions for display. It finalizes the html file @return: void @param :fout2 is the ofstream instance which points to the html file, orig the original main sequence merge the final merged sequence time in seconds, total time for alignment ********************************************************/ void display_html_2(ofstream& fout2, char *orig, char *merge, float time) { //also display time, length etc.... fout2<<"

"; fout2<<"
Original Genome Sequence:"<<""<"; fout2<<"
Length of Sequence: "<"; fout2<<"
Final Genome Sequence     :"<<""<"; fout2<<"
Length of Sequence: "<"; fout2<<"
Time:"<<""< seconds"; fout2<<""; } /******************************************************* * This function creates a html display, This is one of the 5 functions for display. It writes the sequences to the html file @return: void @param : fout2 is the ofstream instance which points to the html file, data array with subsequences NSQ the number of subsquences ********************************************************/ void display_html_original(ofstream& fout2, char **data, int NSQ) { int i,j; int len; // char seq[DATA+1]; fout2<<"
"; fout2<<""; for(i=0; i"; if(strlen(data[i])<16 && strlen(data[i])> 0) { if(i%2) { fout2<<"
"; fout2<
"; } else { fout2<<"
"; fout2<
"; } } else if(strlen(data[i])>=16) { len=strlen(data[i]); if(i%2) { fout2<<"
"; fout2<
"; // fout2<<"

-------"<"; fout2<"; //fout2<"; } } fout2<<""; // fout2<<" \; \; \;"; } fout2<<""; } /***************************************************************************** This function creates a html display, This is one of the 5 functions for display. It writes the final string to the html file @return: void @param :fout2 is the ofstream instance which points to the html file merge the final merged sequence ******************************************************************************/ void display_html_merged(ofstream& fout2, char *merged) { int len; len=strlen(merged); fout2<<"

"; fout2<<"
"; fout2<<"
"; fout2<<"The longest Subsequence found so far...
"<"<<"Lenght ="<"; } /************************************************************************ * This function displays an error in the browser if the size is small or * the number of subsequences areless * @return: void @param :fout2 is the ofstream instance which points to the html file, SIZE is the length of the sequence NSQ the number of subsquences * ********************************************************/ void display_error(ofstream& fout2, int SIZE, int NSQ) { //also display time, length etc.... fout2<<""; fout2<<"

"; fout2<<"
The Program did not run"<<""<<"The reason is below"<<""; fout2<<"
Length of Original Sequence: "<"; fout2<<"
Number of SubSequences: "<"; fout2<<" Length of Original Sequence should be greater than 10 and number of subsequences should be more than 2"; fout2<<""; }