[Previous] [Up]
Go backward to Header file
Go up to Input Module  

Executable Code

File name: read.c

/* Routines for reading the oligo file. */
 
#include <stdio.h>
#include <string.h>
#include "bases.h"
#include "struct.h"
 
/* ----------------------------------------------------------------------
   Extract next whitespace-terminated word from a string.
 
   Given access to a pointer into a string, return a new string
   consisting of everything up to (but not including) the next
   whitespace character, and advance the given pointer to that
   character.
 
   */
 
static char *word(pp)
     char **pp;
{
  char *p = *pp;
  char *w;
  char *q = p;
 
  /* Find next whitespace character, have q point to it. */
  while(*q != ' ' && *q != '\t' && *q != '\0') q++;
 
  /* Allocate correctly-sized chunk with room for trailing '\0'. */
  w = (char *)malloc(q-p+1);
 
  strncpy(w,p,q-p);             /* Copy the word to the new memory. */
  w[q-p]='\0';                  /* Zero-terminate the new string. */
  *pp = q;                      /* Advance input pointer past the word. */
  return w;                     /* Return the word-string. */
}
 
 
/* ----------------------------------------------------------------------
   Look up a symbol in the symbol table.
 
   Given a string and the symbol table, looks up the base-sequence
   named by the string and, if found, returns it (in *pbases and
   *pnbases).  If the string is found, *found is set to 1, otherwise
   the null sequence is returned and *found is 0.
 
   */
 
static void lookup ( w, syms, nSyms, pbases, pnbases, found ) 
     char *w;
     Sym *syms;
     int nSyms;
     int **pbases;
     int *pnbases;
     int *found;
{
  int i;
  /* Straightforward linear search for the given symbol word.
     Return upon finding first match. */
  for(i=0;i<nSyms;i++){
    if (0==strcmp(w,syms[i].name)) {
      *pbases = syms[i].bases;  /* Set output variables. */
      *pnbases = syms[i].len;
      *found = 1;
      return;
    }
  }
  /* Failure; output is set to the null base-sequence. */
  *pbases = 0;
  *pnbases = 0;
  *found = 0;
  return;
}
 
 
/* ----------------------------------------------------------------------
   Add a base to the end of a growing base sequence.
 
   Given a dynamically-allocated base sequence, append a single base
   onto the end, dynamically resizing/relocating the sequence when
   necessary.
 
   */
 
static void add_base (base, pbases, pnbases, parsz)
     int base;
     int **pbases;
     int *pnbases;
     int *parsz;
{
  /* If no more room, resize array before proceeding. */
  if (*pnbases == *parsz) {
    if (*pnbases == 0) {
      /* If null array, do initial allocation. */
      *pbases = (int *)malloc(sizeof(int)*(*parsz = 1));
    } else {
      /* Otherwise, double the array size. */ 
      *pbases = (int *)realloc(*pbases,sizeof(int)*(*parsz *= 2));
    }
  }
  (*pbases)[(*pnbases)++] = base; /* Append the base to end of sequence. */
}
 
 
/* ----------------------------------------------------------------------
   Read a base sequence from a string of base letters and symbol names.
 
   Given a string and a symbol table, translates the string into a new
   base sequence, which is returned in *pbases and *pnbases.  Spaces
   and tabs are used to terminate the names of symbols but are
   otherwise ignored.  Unknown base letters or symbol names are
   ignored.
 
   Special new feature added in this version: "!symname" is the
   reverse complement of "symname".  That is, a '!' character followed
   by a symbol name stands for a sequence which is the reverse
   Watson-Crick complement of the meaning of the symbol.  
 
   Another new feature: if the string starts with "3'" it is taken
   to be given in reverse (3'->5') order.
 
    */
 
static void read_bases_and_syms ( p, syms, nSyms, pbases, pnbases)
     char *p;
     Sym *syms;
     int nSyms;
     int **pbases;
     int *pnbases;
{
  int bArSz = 0;                /* Keeps track of size of base array. */
  int reverse = 0;              /* 1 = Reading sequence in reverse order. */
 
  *pbases = 0;                  /* Initialize result to null base sequence. */
  *pnbases = 0;
 
  while(*p == ' ' || *p == '\t') p++;   /* Skip over tabs and spaces. */
  if (*p == '3' && p[1] == '\'') { /* If string starts with "3'" */
    reverse = 1;               /* Plan to read it in reverse order (3'->5'). */
    p+=2;                      /* Skip over "3'" */
  }
  while(*p) { /* Until end of string... */
    char *w;
    int *b,nb;
    int i;
    int found;
    while(*p == ' ' || *p == '\t') p++; /* Skip over tabs and spaces. */
    if (*p == '\0') /* Nothing left. */ return;
    w = word(&p); /* Extract next word from string (up to whitespace). */
    lookup ( w, syms, nSyms, &b, &nb, &found ); /* Look it up in table. */
    if (found) { /* Add symbol bases onto end of result. */
      if (!reverse) { /* Add them in the normal order (5'->3'). */
        for(i=0;i<nb;i++)
          add_base (b[i], pbases, pnbases, &bArSz);
      } else { /* Add them in reverse order (3'->5'). */
        for(i=nb-1;i>=0;i--)
          add_base (b[i], pbases, pnbases, &bArSz);
      }
    } else if (*w == '!') { /* May be a reverse-complement of a symbol. */
      lookup (w+1, syms, nSyms, &b, &nb, &found); /* Look up sym in table. */
      if (found) { /* Add reverse-complemented bases onto end of result. */
        if (!reverse) {
          /* Add bases in normal order, which for the reverse-complement
             of a symbol, means 3'->5', and complemented. */
          for(i=nb-1;i>=0;i--)
            add_base (b[i]^2, pbases, pnbases, &bArSz);
        } else {
          /* Add bases in reverse order, which for the reverse-complement
             of a symbol, means 5'->3', and complemented. */
          for(i=0;i<nb;i++)
            add_base (b[i]^2, pbases, pnbases, &bArSz);
        }
      }
    }
    if (!found) { /* Assume word is just a sequence of base letters. */
      int l = strlen(w);
      for(i=0;i<l;i++){ /* For each character, */
        int j;
        for(j=0;j<4;j++){       /* compare it with known base letters, */
          if (base_names[j] == w[i]) /* if match, add base to result. */
            add_base (j, pbases, pnbases, &bArSz);
        }
      }
    }
    free(w);                    /* This was allocated in word() above. */
  }
  if (reverse) {
    /* We built the whole thing in 3'->5' order, and so we now need to
       reverse it again, so the result we return will be 5'->3'.
       We'll do the reverse in-place. */
    int n = *pnbases;
    int *b = *pbases;
    int midpoint = n>>1; /* Index of middle base if n odd; just past
                            middle if n even. */
    int i=0;
    for(;i<midpoint;i++){ /* Up to middle, */
      int t = b[i];       /* Swap base with the one at the other end. */
      b[i] = b[(n-1)-i];
      b[(n-1)-i] = t;
    }
  }
}
 
 
/*
   ----------------------------------------------------------------------
   Read a set of oligos, a symbol table, and an initial DNA sequence
   from a file.
 
   The file format is line-based and consists of three kinds of lines:
   comments, symbol definitions, and DNA strands.  Whitespace is
   ignored except for the purpose of terminating a symbol name.  A
   comment is any line whose first nonwhitespace character is "#"; the
   entire line is ignored.  A definition is any line whose first
   nonwhitespace character is ":", followed by a symbol name and a
   meaning.  Any line that is not a comment or a definition and
   contains nonwhitespace characters is considered a strand.  The
   first strand in the file is taken to be the initial strand.  The
   remaining strands are oligos.  Symbol meanings and strands are
   given by strings consisting of the letters "ACTG" and/or
   previously-defined symbol names terminated by whitespace; anything
   else is ignored.  If a symbol name is preceded by "!" then that is
   taken to mean the reverse complement of the given symbol.  The
   order of appearance of sequence information is always considered to
   be 5' to 3', unless the sequence string begins with the string "3'"
   in which case it is understood to be given in 3' to 5' order.
   However, symbol meanings and strands are always *stored* in 5' to
   3' order.  The lengths of lines, symbol names, symbol meanings, and
   base sequences are constrained only by the memory limits of the
   machine.
 
   */
 
void read_oligos ( filename, poligos, pnOligos, psyms, pnSyms,
                  piniDNA, piniDNAlen )
 
     char *filename;
 
     Oligo **poligos;
     int *pnOligos;
     Sym **psyms;
     int *pnSyms;
     int **piniDNA;
     int *piniDNAlen;
{
  Oligo *oligos;
  int nOligos = 0;
  Sym *syms;
  int nSyms = 0;
  int *iniDNA = 0;
  int iniDNAlen = 0;
 
  int oArSz, sArSz, iArSz;
  FILE *stream;
  char *linebuf = (char *)malloc(1000); /* Initial size of line buffer. */
  int lArSz = 1000;
  int gotIniYet = 0;
 
  /* Attempt to open the file for reading. */
  if ((stream = fopen(filename,"r")) == 0) {
    fprintf(stderr,"Couldn't open %s.\n",filename);
    perror("fopen");
    exit(1);
  }
 
  /* Allocate the three result arrays, with a small initial size. */
  oligos = (Oligo *)malloc(sizeof(Oligo)*(oArSz = 1));
  syms = (Sym *)malloc(sizeof(Sym)*(sArSz = 1));
  iniDNA = (int *)malloc(sizeof(int)*(iArSz = 1));
 
  while (!feof(stream)) { /* Until end of file, */
    char *p;
 
    /* Read entire line into linebuf. */
    if (0==fgets(linebuf, lArSz, stream)) break;
    /* If ran out of space, allocate more and keep going. */
    while (strlen(linebuf) == lArSz-1) {
      linebuf = (char *)realloc(linebuf,lArSz*2);
      fgets(linebuf+(lArSz-1), lArSz+1, stream);
      lArSz *= 2;
    }
    /* Change terminal newline (if any) to null. */
    if (linebuf[strlen(linebuf)-1] == '\n')
      linebuf[strlen(linebuf)-1] = '\0';
 
    p = linebuf;
    while (*p == ' ' || *p == '\t') p++; /* Skip initial whitespace. */
    if (*p == '#') continue; /* If comment char, skip rest of line. */
    if (*p == ':') { /* This means to  a new symbol. */
      char *w;
      int *bases;
      int nbases;
      p++;
      w = word(&p); /* Read the next word, up to whitespace.  This
                       will be the name of the symbol. */
      /* Using current symbol table, interpret the remainder of the
         line as a base sequence, to be the meaning of the symbol. */
      read_bases_and_syms ( p, syms, nSyms, &bases, &nbases );
      if (nbases == 0)
        printf("Warning: Symbol %s has empty definition.\n",w);
      /* If out of room in symbol table, double its size. */
      if (nSyms == sArSz)
        syms = (Sym *)realloc(syms,sizeof(Sym)*(sArSz*=2));
      syms[nSyms].name = w;     /* Add the symbol to the table. */
      syms[nSyms].len = nbases;
      syms[nSyms].bases = bases;
      nSyms++;
    } else if (*p != '\0') { /* If line has stuff, interpret as a strand. */
      /* If we haven't seen any strands yet, take this one to be the
         initial strand. */
      if (!gotIniYet) {
        /* Read it out, using current symbol table. */
        read_bases_and_syms ( p, syms, nSyms, &iniDNA, &iniDNAlen );
        if (iniDNAlen == 0) {
          printf("Error: Empty initial DNA strand.\n");
          exit(1);
        }
        gotIniYet = 1;
      } else { /* Not the initial strand, consider it an oligo. */
        /* If no more room in oligo array, double its size. */
        if (nOligos == oArSz)
          oligos = (Oligo*)realloc(oligos,sizeof(Oligo)*(oArSz*=2));
        /* Initialize new oligo to null oligo. */
        oligos[nOligos].bases = 0;
        oligos[nOligos].len = 0;
        /* Read the oligo using our current symbol table. */
        read_bases_and_syms ( p, syms, nSyms,
                             &oligos[nOligos].bases,
                             &oligos[nOligos].len );
        if (oligos[nOligos].len == 0) {
          /* If nothing on the line was understood, ignore the whole line. */
          printf("Warning: Read empty oligo.\n");
        } else {
          /* Otherwise, remember that olig and go on. */
          nOligos++;
        }
      }
    }
  }
 
  /* Free the line buffer; it might have grown large. */
  free(linebuf);
 
  /* Return the results to our caller. */
  *poligos = oligos;
  *pnOligos = nOligos;
  *psyms = syms;
  *pnSyms = nSyms;
  *piniDNA = iniDNA;
  *piniDNAlen = iniDNAlen;
}

- Michael P. Frank, September 12, 1995. Formatted using HyperLaTeX-1.3.

[Previous] [Up]