File name: read.c
/* Routines for reading the oligo file. */
#include <stdio.h>
#include <string.h>
#include "bases.h"
#include "struct.h"
/* ----------------------------------------------------------------------
Extract next whitespace-terminated word from a string.
Given access to a pointer into a string, return a new string
consisting of everything up to (but not including) the next
whitespace character, and advance the given pointer to that
character.
*/
static char *word(pp)
char **pp;
{
char *p = *pp;
char *w;
char *q = p;
/* Find next whitespace character, have q point to it. */
while(*q != ' ' && *q != '\t' && *q != '\0') q++;
/* Allocate correctly-sized chunk with room for trailing '\0'. */
w = (char *)malloc(q-p+1);
strncpy(w,p,q-p); /* Copy the word to the new memory. */
w[q-p]='\0'; /* Zero-terminate the new string. */
*pp = q; /* Advance input pointer past the word. */
return w; /* Return the word-string. */
}
/* ----------------------------------------------------------------------
Look up a symbol in the symbol table.
Given a string and the symbol table, looks up the base-sequence
named by the string and, if found, returns it (in *pbases and
*pnbases). If the string is found, *found is set to 1, otherwise
the null sequence is returned and *found is 0.
*/
static void lookup ( w, syms, nSyms, pbases, pnbases, found )
char *w;
Sym *syms;
int nSyms;
int **pbases;
int *pnbases;
int *found;
{
int i;
/* Straightforward linear search for the given symbol word.
Return upon finding first match. */
for(i=0;i<nSyms;i++){
if (0==strcmp(w,syms[i].name)) {
*pbases = syms[i].bases; /* Set output variables. */
*pnbases = syms[i].len;
*found = 1;
return;
}
}
/* Failure; output is set to the null base-sequence. */
*pbases = 0;
*pnbases = 0;
*found = 0;
return;
}
/* ----------------------------------------------------------------------
Add a base to the end of a growing base sequence.
Given a dynamically-allocated base sequence, append a single base
onto the end, dynamically resizing/relocating the sequence when
necessary.
*/
static void add_base (base, pbases, pnbases, parsz)
int base;
int **pbases;
int *pnbases;
int *parsz;
{
/* If no more room, resize array before proceeding. */
if (*pnbases == *parsz) {
if (*pnbases == 0) {
/* If null array, do initial allocation. */
*pbases = (int *)malloc(sizeof(int)*(*parsz = 1));
} else {
/* Otherwise, double the array size. */
*pbases = (int *)realloc(*pbases,sizeof(int)*(*parsz *= 2));
}
}
(*pbases)[(*pnbases)++] = base; /* Append the base to end of sequence. */
}
/* ----------------------------------------------------------------------
Read a base sequence from a string of base letters and symbol names.
Given a string and a symbol table, translates the string into a new
base sequence, which is returned in *pbases and *pnbases. Spaces
and tabs are used to terminate the names of symbols but are
otherwise ignored. Unknown base letters or symbol names are
ignored.
Special new feature added in this version: "!symname" is the
reverse complement of "symname". That is, a '!' character followed
by a symbol name stands for a sequence which is the reverse
Watson-Crick complement of the meaning of the symbol.
Another new feature: if the string starts with "3'" it is taken
to be given in reverse (3'->5') order.
*/
static void read_bases_and_syms ( p, syms, nSyms, pbases, pnbases)
char *p;
Sym *syms;
int nSyms;
int **pbases;
int *pnbases;
{
int bArSz = 0; /* Keeps track of size of base array. */
int reverse = 0; /* 1 = Reading sequence in reverse order. */
*pbases = 0; /* Initialize result to null base sequence. */
*pnbases = 0;
while(*p == ' ' || *p == '\t') p++; /* Skip over tabs and spaces. */
if (*p == '3' && p[1] == '\'') { /* If string starts with "3'" */
reverse = 1; /* Plan to read it in reverse order (3'->5'). */
p+=2; /* Skip over "3'" */
}
while(*p) { /* Until end of string... */
char *w;
int *b,nb;
int i;
int found;
while(*p == ' ' || *p == '\t') p++; /* Skip over tabs and spaces. */
if (*p == '\0') /* Nothing left. */ return;
w = word(&p); /* Extract next word from string (up to whitespace). */
lookup ( w, syms, nSyms, &b, &nb, &found ); /* Look it up in table. */
if (found) { /* Add symbol bases onto end of result. */
if (!reverse) { /* Add them in the normal order (5'->3'). */
for(i=0;i<nb;i++)
add_base (b[i], pbases, pnbases, &bArSz);
} else { /* Add them in reverse order (3'->5'). */
for(i=nb-1;i>=0;i--)
add_base (b[i], pbases, pnbases, &bArSz);
}
} else if (*w == '!') { /* May be a reverse-complement of a symbol. */
lookup (w+1, syms, nSyms, &b, &nb, &found); /* Look up sym in table. */
if (found) { /* Add reverse-complemented bases onto end of result. */
if (!reverse) {
/* Add bases in normal order, which for the reverse-complement
of a symbol, means 3'->5', and complemented. */
for(i=nb-1;i>=0;i--)
add_base (b[i]^2, pbases, pnbases, &bArSz);
} else {
/* Add bases in reverse order, which for the reverse-complement
of a symbol, means 5'->3', and complemented. */
for(i=0;i<nb;i++)
add_base (b[i]^2, pbases, pnbases, &bArSz);
}
}
}
if (!found) { /* Assume word is just a sequence of base letters. */
int l = strlen(w);
for(i=0;i<l;i++){ /* For each character, */
int j;
for(j=0;j<4;j++){ /* compare it with known base letters, */
if (base_names[j] == w[i]) /* if match, add base to result. */
add_base (j, pbases, pnbases, &bArSz);
}
}
}
free(w); /* This was allocated in word() above. */
}
if (reverse) {
/* We built the whole thing in 3'->5' order, and so we now need to
reverse it again, so the result we return will be 5'->3'.
We'll do the reverse in-place. */
int n = *pnbases;
int *b = *pbases;
int midpoint = n>>1; /* Index of middle base if n odd; just past
middle if n even. */
int i=0;
for(;i<midpoint;i++){ /* Up to middle, */
int t = b[i]; /* Swap base with the one at the other end. */
b[i] = b[(n-1)-i];
b[(n-1)-i] = t;
}
}
}
/*
----------------------------------------------------------------------
Read a set of oligos, a symbol table, and an initial DNA sequence
from a file.
The file format is line-based and consists of three kinds of lines:
comments, symbol definitions, and DNA strands. Whitespace is
ignored except for the purpose of terminating a symbol name. A
comment is any line whose first nonwhitespace character is "#"; the
entire line is ignored. A definition is any line whose first
nonwhitespace character is ":", followed by a symbol name and a
meaning. Any line that is not a comment or a definition and
contains nonwhitespace characters is considered a strand. The
first strand in the file is taken to be the initial strand. The
remaining strands are oligos. Symbol meanings and strands are
given by strings consisting of the letters "ACTG" and/or
previously-defined symbol names terminated by whitespace; anything
else is ignored. If a symbol name is preceded by "!" then that is
taken to mean the reverse complement of the given symbol. The
order of appearance of sequence information is always considered to
be 5' to 3', unless the sequence string begins with the string "3'"
in which case it is understood to be given in 3' to 5' order.
However, symbol meanings and strands are always *stored* in 5' to
3' order. The lengths of lines, symbol names, symbol meanings, and
base sequences are constrained only by the memory limits of the
machine.
*/
void read_oligos ( filename, poligos, pnOligos, psyms, pnSyms,
piniDNA, piniDNAlen )
char *filename;
Oligo **poligos;
int *pnOligos;
Sym **psyms;
int *pnSyms;
int **piniDNA;
int *piniDNAlen;
{
Oligo *oligos;
int nOligos = 0;
Sym *syms;
int nSyms = 0;
int *iniDNA = 0;
int iniDNAlen = 0;
int oArSz, sArSz, iArSz;
FILE *stream;
char *linebuf = (char *)malloc(1000); /* Initial size of line buffer. */
int lArSz = 1000;
int gotIniYet = 0;
/* Attempt to open the file for reading. */
if ((stream = fopen(filename,"r")) == 0) {
fprintf(stderr,"Couldn't open %s.\n",filename);
perror("fopen");
exit(1);
}
/* Allocate the three result arrays, with a small initial size. */
oligos = (Oligo *)malloc(sizeof(Oligo)*(oArSz = 1));
syms = (Sym *)malloc(sizeof(Sym)*(sArSz = 1));
iniDNA = (int *)malloc(sizeof(int)*(iArSz = 1));
while (!feof(stream)) { /* Until end of file, */
char *p;
/* Read entire line into linebuf. */
if (0==fgets(linebuf, lArSz, stream)) break;
/* If ran out of space, allocate more and keep going. */
while (strlen(linebuf) == lArSz-1) {
linebuf = (char *)realloc(linebuf,lArSz*2);
fgets(linebuf+(lArSz-1), lArSz+1, stream);
lArSz *= 2;
}
/* Change terminal newline (if any) to null. */
if (linebuf[strlen(linebuf)-1] == '\n')
linebuf[strlen(linebuf)-1] = '\0';
p = linebuf;
while (*p == ' ' || *p == '\t') p++; /* Skip initial whitespace. */
if (*p == '#') continue; /* If comment char, skip rest of line. */
if (*p == ':') { /* This means to a new symbol. */
char *w;
int *bases;
int nbases;
p++;
w = word(&p); /* Read the next word, up to whitespace. This
will be the name of the symbol. */
/* Using current symbol table, interpret the remainder of the
line as a base sequence, to be the meaning of the symbol. */
read_bases_and_syms ( p, syms, nSyms, &bases, &nbases );
if (nbases == 0)
printf("Warning: Symbol %s has empty definition.\n",w);
/* If out of room in symbol table, double its size. */
if (nSyms == sArSz)
syms = (Sym *)realloc(syms,sizeof(Sym)*(sArSz*=2));
syms[nSyms].name = w; /* Add the symbol to the table. */
syms[nSyms].len = nbases;
syms[nSyms].bases = bases;
nSyms++;
} else if (*p != '\0') { /* If line has stuff, interpret as a strand. */
/* If we haven't seen any strands yet, take this one to be the
initial strand. */
if (!gotIniYet) {
/* Read it out, using current symbol table. */
read_bases_and_syms ( p, syms, nSyms, &iniDNA, &iniDNAlen );
if (iniDNAlen == 0) {
printf("Error: Empty initial DNA strand.\n");
exit(1);
}
gotIniYet = 1;
} else { /* Not the initial strand, consider it an oligo. */
/* If no more room in oligo array, double its size. */
if (nOligos == oArSz)
oligos = (Oligo*)realloc(oligos,sizeof(Oligo)*(oArSz*=2));
/* Initialize new oligo to null oligo. */
oligos[nOligos].bases = 0;
oligos[nOligos].len = 0;
/* Read the oligo using our current symbol table. */
read_bases_and_syms ( p, syms, nSyms,
&oligos[nOligos].bases,
&oligos[nOligos].len );
if (oligos[nOligos].len == 0) {
/* If nothing on the line was understood, ignore the whole line. */
printf("Warning: Read empty oligo.\n");
} else {
/* Otherwise, remember that olig and go on. */
nOligos++;
}
}
}
}
/* Free the line buffer; it might have grown large. */
free(linebuf);
/* Return the results to our caller. */
*poligos = oligos;
*pnOligos = nOligos;
*psyms = syms;
*pnSyms = nSyms;
*piniDNA = iniDNA;
*piniDNAlen = iniDNAlen;
}