/* MOTIF

   This program looks for a sequence motif defined by the user in the
   sequence(s) contained in the database file specified.

   The program can be invoked either by use of command line parameters or
   in an interactive mode when the user is prompted for replies.
   Help on the command line parameters can be obtained by typing:

       MOTIF HELP or MOTIF ?.

   The program is written in standard ANSI C.  In this source file conditional
   compliation for the Borland Turbo C complier (IBM PC) or the
   Waterloo compiler for an IBM 3090 mainframe can be selected by setting
   the appropriate define to TRUE and the other to FALSE.  Most constants
   set by the conditional compliation are concerned with the underlying
   file naming conventions on the different systems and with the amount of
   memory available.  The code is written to be able to run on either
   ASCII or EBCDIC encoded machines.

   Please acknowledge the original description of this program in any
   publications:

       Cockwell & Giles (1989) Comput Applic Biosci  5, 227-232
*/

/*  17.10.89: Small bug fix in GETSEQ (R. Fuchs), approved by I. Giles */
/*  18.10.89: Ported to VAX C by R. Fuchs (FUCHS@EMBL). Uses Curses, so make
              sure that LNK$LIBRARY is defined as SYS$LIBRARY:VAXCCURSE and
              LNK$LIBRARY_1 as SYS$LIBRARY:VAXCRTL */

#include <stdio.h>
#include <string.h>
#include <ctype.h>

#define FALSE 0
#define TRUE 1
#define MAXMATRIX 40
#define MAXQ 100
#define ARRSIZ 256
#define MAXLINE 512

#define waterloo FALSE
#define turboc FALSE
#define VAX TRUE

#if (turboc)
#include <conio.h>
#define RESFILE "MOTIF.OUT"
#define ERRLOG "CON"
#define SCANNAME "SCA"
#define MAXSEQ  10000
#define FILENAMELEN 80
#define cls clrscr
#endif

#if (VAX)
#include <curses.h>
#define RESFILE "MOTIF.OUT"
#define ERRLOG "screen"
#define SCANNAME "SCA"
#define MAXSEQ  200000
#define FILENAMELEN 80
#define cls clear
#endif

#if (waterloo)
#include <stdlib.h>
#include <file.h>
#define RESFILE "MOTIF OUTPUT A"
#define ERRLOG "TERMINAL"
#define MAXSEQ 200000
#define SCANNAME "SCA"
#define FILENAMELEN 20
#define strchr index
#define cls clear
#endif

int dlength, matches, option, qlength, totalseq;
int left, right;
char setmatrix[ARRSIZ][MAXMATRIX];
char uptable[256];
char qseq[MAXQ], *dseq;
char ename[FILENAMELEN], rname[FILENAMELEN], lname[FILENAMELEN];
char dbname[71];
FILE *rfil, *lfil, *efil;

main(argc,argv)
   int argc;
   char *argv[];
{
    int test;
    char *calloc();

#if (VAX)
    initscr();
#endif

    settranstable();
    if(argc >1) getnames(argc,argv);
    else interractive();

    test = databasecheck(lname);
    openfiles();
    iub();
    makesetmatrix();

    if((dseq=calloc(MAXSEQ,sizeof(char))) == 0){
        fprintf(efil,"ERROR: cannot allocate memory for sequence array.\n");
        exit(0);
        }

    printf("Executing ...\n");
    if(test == 1) /* scan database using fast file read */
      getdbseq();
    else          /* scan database using the slower getseq */
      getdata();

    if (efil!=stdout)
      fclose(efil);
    if(rfil!=stdout)
      fclose(rfil);
    fclose(lfil);
    free(dseq);

#if (VAX)
    endwin();
#endif

    return(0);
}


interractive()
{
    cls();
    puts("MOTIF ");
    puts("~~~~~");
    printf("   A utility to search for a given sequence within another");
    printf(" sequence or set of\nsequences.\n");

    while(*qseq == 0){
         printf("\nMotif ?: ");
         gets(qseq);
         }
    qlength = strlen(qseq);
    if(qlength >= MAXQ){
        printf("ERROR: motif %s too long: must be less than %d characters.\n"
        ,qseq,MAXQ);
        exit(0);
        }

    while(*lname == 0){
         printf("Sequence to be searched ?: ");
         gets(lname);
         }

    printf("Results file name (Default: %s) ?: ",RESFILE);
    gets(rname);
    if (*rname == 0) strcpy(rname,RESFILE);

    printf("Errorlog file name (Default: %s) ?: ",ERRLOG);
    gets(ename);
    if (*ename == 0) strcpy(ename,ERRLOG);
}


getnames(argc,argv)
   int argc;
   char *argv[];
{
    int i, k;
    char key, arg[MAXQ], *eq;
    char opt[20];
    char *calloc();

    if((eq=calloc(MAXQ,sizeof(char))) == 0){
        fprintf(efil,"ERROR: cannot allocate array.\n");
        exit(0);
        }

    i = 1;
    option = 0;

    strcpy(rname,RESFILE);
    strcpy(ename,ERRLOG);
    if (argc == 2) helpscreen();
    else if (argc > 2) {
         while( i < argc) {
              strcpy(arg, argv[i]);
              if (arg[0] == '(') {
                   i++;
                   strcpy(opt,argv[i]);
                   opt[0] = toupper(opt[0]);
                   if (opt[0] == 'B') option = 2;
                   else if (opt[0] == 'C') option = 1;
                   else option = 0;
                   break;
                   }
              else {
                   eq = strchr(arg,'=');
                   strncpy(&key,arg,1);
                   key = toupper(key);
                   }
              switch (key) {


                   case 'O':             /* get results file */
                   strcpy(rname, eq+1 );
                   for(k = 1;k <= 2; k++) {
                        if(i+1 >= argc) break;
                        if(strchr(argv[i+1],'=')) break;
                        if(strchr(argv[i+1],'(')) break;
                        else {
                             strcpy( arg, argv[i + 1]);
                             i++;
                             strcat(rname," ");
                             strcat(rname, arg);
                             }
                        }
                   break;

                   case 'M':              /* get query sequence */
                   strncpy(qseq, eq+1,MAXQ);
                   break;

                   case 'S':        /* get sequence to be searched */
                   strcpy(lname, eq+1);
                   for(k = 1;k <= 2; k++) {
                        if(i+1 >= argc) break;
                        if(strchr(argv[i+1],'=')) break;
                        if(strchr(argv[i+1],'(')) break;
                        else {
                             strcpy( arg, argv[i + 1]);
                             i++;
                             strcat(lname," ");
                             strcat(lname, arg);
                             }
                        }
                   break;

                   case 'E':        /* get sequence to be searched */
                   strcpy(ename, eq+1);
                   for(k = 1;k <= 2; k++) {
                        if(i+1 >= argc) break;
                        if(strchr(argv[i+1],'=')) break;
                        if(strchr(argv[i+1],'(')) break;
                        else {
                             strcpy( arg, argv[i + 1]);
                             i++;
                             strcat(ename," ");
                             strcat(ename, arg);
                             }
                        }
                   break;

              default:
                   printf("WARNING: Unknown argument %s ; ",argv[i]);
              }
              i++;
         }
    }

    qlength = strlen(qseq); /* set initial length of query sequence */
    if (qlength == 0) {
         printf("ERROR: no motif");
         exit(0);
         }
    if(qlength >= MAXQ){
        printf("ERROR: motif %s too long: must be less than %d characters.\n"
        ,qseq,MAXQ);
        exit(0);
        }
}

/* this routine sets up a translation table, used when reading sequence data,
   that forces all lower case chars to upper case, and U to T.  This routine
   does not rely on ASCII character encoding
*/
settranstable()
{
    int i;
    char high[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ*-@";
    char low[]  = "abcdefghijklmnopqrstuvwxyz*-@";

    for(i=0;i<256;i++)
         uptable[i] = 0;
    for(i=0;i<29;i++)
       uptable[high[i]] = uptable[low[i]] = (char)high[i];
    uptable['u'] = uptable['U'] = (char)'T';
}

/* This sets up translation used in defining motif for lower case letters
   which are used as the IUB abbreviations for nucleotides.  This routine
   effectively translates lower case IUB symbols into our motif definition
   format; eg r become [AG].
*/
iub()
{
    int i,len;
    char qtemp[MAXMATRIX];

    char *r = "[AG]";
    char *y = "[TC]";
    char *m = "[AG]";
    char *k = "[TC]";
    char *s = "[CG]";
    char *w = "[AT]";
    char *h = "[ACT]";
    char *b = "[CGT]";
    char *v = "[ACT]";
    char *d = "[AGT]";

    *qtemp = 0;
    len = strlen(qseq);
    for(i=0;i<len;i++){
        if(islower(qseq[i]) == 0) strncat(qtemp,qseq+i,1);
         else switch(qseq[i]) {

              case 'r':
                  strcat(qtemp,r);
                  break;

              case 'y':
                  strcat(qtemp,y);
                  break;

              case 'm':
                  strcat(qtemp,m);
                  break;

              case 'k':
                  strcat(qtemp,k);
                  break;

              case 's':
                  strcat(qtemp,s);
                  break;

              case 'w':
                  strcat(qtemp,w);
                  break;

              case 'h':
                  strcat(qtemp,h);
                  break;

              case 'b':
                  strcat(qtemp,b);
                  break;

              case 'v':
                  strcat(qtemp,v);
                  break;

              case 'd':
                  strcat(qtemp,d);
                  break;

              case 'n':
                  strcat(qtemp,"X");
                   break;

              case 'x':
                  strcat(qtemp,"X");
                  break;

              default :
                  fprintf(efil,"ERROR: unrecognised IUB symbol %c.\n",qseq[i]);
                  exit(0);
                  }
         }
     strcpy(qseq,qtemp);
     qlength = strlen(qseq);
}

openfiles()
{
    if(strcmp(ename,"screen")) {
        efil = fopen(ename,"w");
        if (efil == 0) {
            printf("ERROR: cannot open error log file %s\n",ename);
            exit(0);
            }
        }
    else
        efil = stdout;

    if(strcmp(rname,"screen")) {
        rfil = fopen(rname,"w");
        if (rfil == 0) {
            fprintf(efil,"ERROR: cannot open results file %s \n",rname);
            exit(0);
            }
        }
    else
        rfil = stdout;

    lfil = fopen(lname,"r");
    if (lfil == 0) {
         fprintf(efil,"ERROR: cannot open sequence file %s\n",lname);
         exit(0);
        }
#ifndef waterloo
    if (setvbuf(lfil, NULL, _IOFBF, 32767) != 0) {
       fprintf(efil,"WARNING: insufficient memory for large database buffer\n");
       }
#endif

    fprintf(rfil,"Motif = %s\n",qseq);
    fprintf(rfil,"File to be searched = %s\n",lname);
    fprintf(rfil,"Results file = %s\n\n",rname);
}

makesetmatrix()        /* set up set membership matrix */
{
    int i, j, bracket, k, count, not;
    char temp[MAXQ];

    count = bracket = not = left = right = 0;

    if (qseq[0]=='.'){
        left = 1;       /* left end marker */
        strcpy(temp,qseq+1);
        *qseq = 0;
        strcpy(qseq,temp);
        qlength--;
        }

    if (qseq[strlen(qseq)-1]=='.'){
        right = 1;       /* right end marker */
        strncpy(temp,qseq,strlen(qseq)-1);
        temp[strlen(qseq)-1] = '\0';
        *qseq = 0;
        strcpy(qseq,temp);
        qlength--;
        }

    for(i = 0;i<MAXMATRIX;i++)
         for(j=0;j<ARRSIZ;j++)
              setmatrix[j][i] = 0;        /* set whole setmatrix to zero */

    for(i=0; i<qlength; i++) {        /* set values in qseq to 1 */
         if (count > MAXMATRIX) break;
         if (qseq[i] == '^') {
              not = 1;
              continue;
              }
         if (qseq[i] == '[') {       /* if a bracket occurs, do not */
              bracket = 1;           /* increment the counter */
              continue;
              }
         if (qseq[i] ==']') {
              bracket = 0;
              if ((bracket == 0) &&(not == 1)) not = donot(not,count+1);
              count++;
              continue;
              }
        if((qseq[i] == 'X') || (qseq[i] =='-')){
              for(k=0;k<ARRSIZ;k++) {
                    setmatrix[k][count] = 1;
                    }
              }
         j = (int)qseq[i];
         if (bracket == 0) {
              setmatrix[j][count] = 1;
              count++;
              }
         if (bracket  == 1) {
              setmatrix[j][count] = 1;
              continue;
              }
         if (not == 1) not = donot(not,count);  /*if a not(^) occurs,*/
         }                                  /* invert the column of the*/
         qlength = count;                   /* setmatrix */

     /* adjust qlength to account for brackets */

     if(bracket == 1) {
         fprintf(efil,"ERROR: bracket not closed in query motif.\n");
         exit(0);
        }

     if(qlength >= MAXMATRIX){
        fprintf(efil,"ERROR: motif too long, must have less than %d elements\n",
         MAXMATRIX);
         exit(0);
         }
}

donot(not,count)      /* inverts a column of the setmatrix */
   int not, count;
{
    int k;
    for (k=0;k<256;k++) {
         if (setmatrix[k][count-1] == 0) setmatrix[k][count-1] = 1;
         else if (setmatrix[k][count-1] == 1) setmatrix[k][count-1] = 0;
         }
    not = 0;
    return (not);
}

/* check to see if sequence file is a SCANFILE - filename has an extension of
   .SCA (MS-DOS) or filetype of SCANFILE (CMS).  Routine returns either 0 or 1
*/
databasecheck(name)
   char name[];
{
    int i, test;
    char *type;

    test = 0;

    type = strchr(name,' ');
    if (type == 0) type = strchr(name,'.');
    if (type == 0) return(0);

    for(i=1;i<=strlen(SCANNAME);i++)
         type[i] = uptable[type[i]];

    if(strncmp(type+1,SCANNAME,strlen(SCANNAME)) == 0) test++;

    return(test);
}

getdbseq()
{
    char string[MAXLINE];
    int test,total;
    total = totalseq = 0;

    fgets(string,MAXLINE,lfil);     /* read in first name */
    if (string[0] == '>') strncpy(dbname,string,70);
    else {
         fprintf(efil,"Warning: SCANFILE in unrecognised format\n");
         test = 0;
         }
    if(test == 0) return(0);
    while ((dlength = getlib(string)) != 0) {
         if (dlength >= qlength) {

              compare(0);
              total = total + matches;
              }
         }
    if (total == 0) fprintf(rfil,"no matches found\n");
    else fprintf(rfil,"\ntotal of %d matches in %d sequences\n"
           ,total,totalseq);
}


getdata()
{
    int total;
    totalseq = total = 0;

    while ((dlength = getseq()) != 0) {
              if (dlength >= qlength){
                   compare(0);
                   total = total + matches;
                   }
              }

    if (total == 0) fprintf(rfil,"no matches found\n");
    else fprintf(rfil,"\ntotal of %d matches in %d sequences\n"
           ,total,totalseq);
}

getlib(string)              /* read in SCANFILE format sequences */
   char string[];
{
    int i, len, n;

    n = 0;
    if (string[0] == '>') strncpy(dbname,string+1,70);
    if(*dbname != 0)
        if(strlen(dbname) >= 70) dbname[70] = '\n';

    while(fgets(string,MAXLINE,lfil) != 0) {
         if (string[0] == '>') {
              break;
             }
        len = strlen(string) - 1;
        for(i=0;i<len;i++){
#if (waterloo)
        if(string[i]==' ') break;
#endif
             if(n==MAXSEQ) break;
             dseq[n++] = string[i];
             }
        }

    if(n==MAXSEQ)
       fprintf(efil,"\nWARNING: sequence %s over %d long; truncated\n",
       dbname,MAXSEQ);

    return(n);
}

getseq()       /* get a single sequence in any of the supported formats */
{
    int i, j, len;
    int embl, genbank, staden, fastp, codata;
    char c, temp[MAXLINE];
    int offset;

    embl = genbank = staden = fastp = codata = FALSE;
    offset = j = 0;

    if(fgets(temp,MAXLINE,lfil) == 0) return(0);
    if ((temp[0] == '<')&&(temp[19] == '>')) staden = TRUE;
    else if(strncmp(temp,"ID   ",5) == 0) embl = TRUE;
    else if(strncmp(temp,"LOCUS     ",10) == 0) genbank = TRUE;
    else if(strncmp(temp,"ENTRY",5) == 0) codata = TRUE;
    else if (temp[0] == '>') fastp = TRUE;

    if (embl){
        strncpy(dbname,temp+5,70);
         while(temp[0] != ' ')
              fgets(temp,MAXLINE,lfil);
         }

    if (genbank){
         while(strncmp(temp,"ORIGIN",6) !=0){
              fgets(temp,MAXLINE,lfil);
              if(strncmp(temp,"DEFINITION",10) == 0)
              strncpy(dbname,temp+12,70);
              }
         fgets(temp,MAXLINE,lfil);
         }

    if(codata){
         strncpy(dbname,temp+6,70);
         while(strncmp(temp,"SEQUENCE",8) != 0)
              fgets(temp,MAXLINE,lfil);
         fgets(temp,MAXLINE,lfil);
         }

    if (fastp){
         strncpy(dbname,temp+1,70 );
         fgets(temp,MAXLINE,lfil);
         }

    if(staden){
         strncpy(dbname,temp+1,18);
         offset = 20;
         }

    if(*dbname != 0)
        if(strlen(dbname) >= 70) dbname[70] = '\n';

    do{
         if(strncmp(temp,"//",2) == 0) break;
         len = strlen(temp);
         for(i=offset;i<len;i++){
             if(temp[i] == '@'){
                 while (fgets(temp,MAXLINE,lfil) != 0) ;
                 return(j);
                 }
	      if(j==MAXSEQ) break;
	      if((c = uptable[temp[i]]) > 0) dseq[j++] = c;
              else if(temp[i] == '\n')break;
              }
         if(staden) offset = 0;
         if(fgets(temp,MAXLINE,lfil) == 0) break;
         }
    while (TRUE);

   if(j==MAXSEQ)
       fprintf(efil,"\nWARNING: sequence %s over %d long; truncated\n",
       dbname,MAXSEQ);
    dseq[j+1] = '\0';
    return(j);
}

compare()
{
    int i, j, total, k, len;

    matches = 0;
    dlength = dlength + 1;

    /* add one to total for each match, as soon as a match does not occur
    break out of the loop */
    len = dlength-qlength;
    for(i=0;i<len;i++) {
        if((left==1)&&(i==1)) break;
        if(right==1) i=len-1;
        for(j=0,total=0;j<qlength;j++) {
            total = total + setmatrix[(int)dseq[i+j]][j];
            if (total <= j) break;
            }

/* report a match only where the total = length of query sequence */
    if (total == qlength) {
         matches = matches + 1;
         if (matches == 1) fprintf(rfil,"\n%s",dbname);
         if((matches == 1)&&(option == 1)) fprintf(rfil,"matches at : ");
         if (option == 1) fprintf(rfil,"%d  ",i+1);
         if ((option == 0)&& (matches > 0)) {
              if (matches > 1)  fprintf(rfil,"\n");
              fprintf(rfil,"match at %d, sequence matched: ",i+1);
             for (k=i;k<i+qlength;k++)
                   fprintf(rfil,"%c",dseq[k]);
              }
         }
    }
    if(matches > 0) {
         if (option != 2 ) fprintf(rfil,"\n");
         fprintf(rfil,"total matches: %d \n",matches);
         totalseq++;
         }
}

helpscreen()
{
    cls();
    printf("MOTIF ");
    printf("\n~~~~~\n");
    printf("   A utility to search for a given motif within another sequence or set of\n");
    printf("sequences.  The motif is defined as single elements;  motif=ABCD, or\n");
    printf("bracketed elements motif=AB[DEF]C, where the third element can be one of D,\n");
    printf("E or F.  A ^ before an element signifies 'not'.  A - or X matches anything.\n\n");
    printf("A dot (.) anchors the search to the start (or end) of the sequence.\n\n");
    printf("   PARAMETERS\n   ~~~~~~~~~~\n");
    printf("   MOTIF=       The motif searched for           Mandatory parameter\n");
    printf("   SEQ=         The sequence searched through    Mandatory parameter\n");
    printf("   OUT=         The results file ");
    printf("                DEFAULT: %s\n",RESFILE);
    printf("   ERROR=       File to receive errors           DEFAULT: ");
    printf("%s (screen)\n",ERRLOG);
    printf("\n   (options     Output options.                  DEFAULT: VERBOSE");
    printf("\n\n");
    printf("      BRIEF  : only the number of matches in a sequence are report.\n");
    printf("      COMPACT: number and positions of matches are reported.\n");
    printf("      VERBOSE: number, positions and sequences of matches");
    printf(" are reported.\n");
    exit(0);
}
