Logo Search packages:      
Sourcecode: epcr version File versions  Download package

fasta-io.cpp

///////////////////////////////////////////////////////////////////
//
//          Functions used by e-PCR to read FASTA files. 
//
///////////////////////////////////////////////////////////////////
// $Id: fasta-io.cpp,v 1.1 2003/03/31 21:08:27 rotmistr Exp $
///////////////////////////////////////////////////////////////////

#include "fasta-io.h"
#include "gdsutil.h"


char chrValidAa[] = "ABCDEFGHIKLMNPQRSTUVWXZ-*";
char chrValidNt[] = "TCAGNBDHKMRSVWY-";  
char stdCode[] = "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";

static char no_name[] = "????????";
static char no_title[] = "NO TITLE";
static char no_defline[] = "???????? NO TITLE";


/*

  want to add the ability to write mrna, CDS, and protein FASTA files

    name "Standard" ,
  name "SGC0" ,
  id 1 ,
  ncbieaa  "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
  sncbieaa "---M---------------M---------------M----------------------------"
  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG


*/


///////////////////////////////////////////////////////////////
//
//
//          class FastaSeq
//
//

FastaSeq::FastaSeq ()
{
      m_tag = NULL;
      m_acc = NULL;
      m_def = NULL;
      m_seq = NULL; 
      m_len = 0;
}


FastaSeq::FastaSeq (char *def, char *seq)
{
      m_tag = NULL;
      m_acc = NULL;
      m_def = NULL;
      m_seq = NULL; 
      SetDefline(def);
      SetSequence(seq);
}


FastaSeq::~FastaSeq ()
{
      Clear(); 
}


const char * FastaSeq::Label() const
{
      return (m_tag==NULL) ? no_name : m_tag; 
}


const char * FastaSeq::Accession() const
{
      return (m_acc==NULL) ? no_name : m_acc; 
}


const char * FastaSeq::Defline () const
{
      return (m_def==NULL) ? no_defline : m_def; 
}


const char * FastaSeq::Title () const
{
      if (m_def==NULL) return no_title;
      char *p;
      if (p = strchr(m_def, ' '))
      {
            while (*p && isspace(*p))  p++;
            if (*p) return p;
      }
      return m_def; 
}


const char * FastaSeq::Sequence () const
{
      return m_seq; 
}


int FastaSeq::Length() const
{
      return m_len;
}


void FastaSeq::SetDefline (char *string)
{
      MemDealloc(m_tag);
      MemDealloc(m_acc);
      MemDealloc(m_def);
      m_def = string;
      if (string)
      {
            char *p = string;
            if (*p == ' ') p++;
            int n = strcspn(p," ");
            if (MemAlloc(m_tag,1+n))
            {
                  memcpy((void*)m_tag,p,n);
                  m_tag[n] = 0;
                  if (strncmp(m_tag,"gi|",3) ==0)
                  {
                        p = m_tag+3;
                        while (*p && *p != '|')  p++;
                        p++;
                        while (*p && *p != '|')  p++;
                        p++;
                        n = strlen(p);
                        MemAlloc(m_acc,1+n);
                        memcpy((void*)m_acc,p,n);
                        m_acc[n] = 0;
                        p = m_acc;
                        while (*p)
                        {
                              if (*p == '|') { *p=0; break; }
                              p++;
                        }
                  }
                  else
                  {
                        MemAlloc(m_acc,1+n);
                        strcpy(m_acc,m_tag);
                  }
            }
      }
}


void FastaSeq::SetSequence (char *string)
{
      MemDealloc(m_seq); 
      m_seq = string; 
      m_len = string ? strlen(string) : 0;
}


void FastaSeq::Clear()
{
      SetDefline(NULL); 
      SetSequence(NULL); 
}


bool FastaSeq::ParseText (const char *text, int upper, const char *alphabet)
{
      if (text==NULL || *text==0)
            return FALSE;

      
      const char *p2 = text;
      char *defline = NULL;
      char *sequence = NULL;
      if (*p2 == '>')
      {
            int n = strcspn((char*)text,"\r\n");
            p2 = text + n;
            n--;
            const char *p = text+1;
            while (p<p2 && isspace(*p))
            {
                  p++;
                  n--;
            }
            if (n>0 && MemAlloc(defline,1+n))
            {
                  memcpy((void*)defline,(void*)p,n);
                  defline[n] =0;
            }
      }
      
      if (MemAlloc(sequence,1+strlen(p2)))
      {
            char *p1 = sequence;
            char chr, chr2;
            while (*p2)
            {
                  chr = *p2;
                  if (isalpha(chr))
                  {
                        chr2 = toupper(chr);
                        if (upper) 
                              chr = chr2;
                        if (strchr(alphabet,chr2))
                              *p1++ = chr;
                  }
                  p2++;
            }
            *p1 = 0;          
            SetDefline(defline);
            SetSequence(sequence);
      }
      //return (m_len==0) ? FALSE : TRUE;
      return TRUE;
}


bool FastaSeq::ParseText (const char *text, int upper, int seqtype)
{
      if (seqtype == SEQTYPE_AA)
            return ParseText(text,upper,chrValidAa);

      if (seqtype == SEQTYPE_NT)
            return ParseText(text,upper,chrValidNt);

      PrintError("ParseText(text,code);   ERROR: Invalid code");
      return FALSE;
}



///////////////////////////////////////////////////////////////
//
//
//          class FastaFile
//
//

struct sidx
{
      char *acc;
      fpos_t fpos;
};


FastaFile::FastaFile ()
{
      m_file_name = NULL;
      m_file = NULL;
      m_seqtype = SEQTYPE_NT;
      m_index = NULL;
      m_count = 0;
      m_toupper = 0;
}


FastaFile::FastaFile (int seqtype)
{
      m_file_name = NULL;
      m_file = NULL;
      m_seqtype = seqtype;
      m_index = NULL;
      m_count = 0;
      m_toupper = 0;
}


FastaFile::~FastaFile ()
{
      if (m_file)  Close();
}


bool FastaFile::Open (const char *fname, const char *fmode)
{
      if (!fmode) return FALSE;

      if (m_file != NULL)
      {
            PrintError("FastaFile::Open();  WARNING: file already open");
            return FALSE;
      }

      if (fname)
      {
            m_file = fopen(fname,fmode);
      }
      else
      {
            m_file = (*fmode == 'w') ? stdout : stdin;
      }
      
      if (m_file == NULL)
      {
            PrintError("FastaFile::Open();  ERROR: unable to open file");
            return FALSE;
      }

      m_file_name = newstring(fname);
      return TRUE;
}


bool FastaFile::Close ()
{
      if (m_file == NULL)
      {
            PrintError("FastaFile::Close();  WARNING: file already closed");
            return FALSE;
      }

      if (m_file != stdin && m_file != stdout)  fclose(m_file);

      m_file = NULL;
      delete m_file_name;
      m_file_name =NULL;
      return TRUE;
}



int FastaFile::Attach (FILE *file)
{
      m_file = file;
      return 1;
}


FILE* FastaFile::Detach ()
{
      FILE *f = m_file;
      m_file = NULL;
      return f;
}


/****

long FastaFile::Tell ()
{
      return ftell(m_file);
}


int  FastaFile::Seek (long pos, int whence)
{
      if (fseek(m_file,pos,whence) !=0)
      {
            fprintf(stderr,"FastaFile::Seek() ERROR\n");
            return 0;
      }
      return 1;
}

****/


int  FastaFile::SetFilePos (fpos_t &fpos)
{
      if (fsetpos(m_file,&fpos) !=0)
      {
            fprintf(stderr,"FastaFile::SetFilePos() ERROR\n");
            return 0;
      }
      return 1;
}


int  FastaFile::GetFilePos (fpos_t &fpos)
{
      if (fgetpos(m_file,&fpos) !=0)
      {
            fprintf(stderr,"FastaFile::GetFilePos() ERROR\n");
            return 0;
      }
      return 1;
}


extern "C" int sidx_cmp (const void *ptr1, const void *ptr2)
{
      struct sidx *s1 = (struct sidx*) ptr1;
      struct sidx *s2 = (struct sidx*) ptr2;
      return strcmp(s1->acc,s2->acc);
}


int  FastaFile::BuildSeqIndex ()
{
      int slots = 1024;
      int cnt = 0;
      sidx *arr;

      MemAlloc(arr,slots*sizeof(struct sidx));

      char tmp1[48];
      char tmp2[16];
      FastaSeq faseq;
      fpos_t fpos;
      if (!GetFilePos(fpos)) fprintf(stderr,"screwed\n");

      while (Read(faseq))
      {
            if (cnt >= slots-1)
            {
                  slots *= 2;
                  if (!MemResize(arr,slots*sizeof(struct sidx)))
                  {
                        fprintf(stderr,"Out of memory\n");
                        exit(0);
                  }
            }

            strcpy(tmp1,faseq.SeqIdent());
            ExtractSeqAcc(tmp1,tmp2);

            arr[cnt].acc = newstring(tmp1);
            arr[cnt].fpos = fpos;
            cnt++;

            if (strcmp(tmp1,tmp2)!=0)
            {
            arr[cnt].acc = newstring(tmp2);
            arr[cnt].fpos = fpos;
            cnt++;
            }

            if (!GetFilePos(fpos)) 
                  fprintf(stderr,"FastaFile::BuildSeqIndex() -- can't get file offset\n");
      }

      m_count = cnt;
      m_index = (void*)arr;

      qsort(m_index,m_count,sizeof(struct sidx),sidx_cmp);

      return m_count;
}



int FastaFile::GetSeqOffset (const char *acc, fpos_t &fpos)
{
      if (!acc) return 0;

      if (!m_index)
      {
            fprintf(stderr,"FastaFile::GetSeqOffset() -- no index (programmer error)\n");
            return 0;
      }

      sidx srec;
      srec.acc = (char*)acc;
      sidx *s = (struct sidx*) bsearch((void*)&srec,m_index,m_count,sizeof(struct sidx),sidx_cmp);
      if (s)
      {
            fpos = s->fpos;
            return 1;
      }
      return 0;
}



#define CHUNK 10000


bool FastaFile::Read (FastaSeq *&faseq)
{
      faseq = new FastaSeq();
      return Read(*faseq);
}


bool FastaFile::Read (FastaSeq &faseq)
{
      faseq.Clear();

      if (!IsOpen())
            return FALSE;

      char line[2000];

      if (!fgets(line, sizeof line, m_file))
            return FALSE;  

      if (line[0] != '>')
      {
            PrintError("FastaFile::Read();  ERROR: was expecting '>'");
            return FALSE;
      }

      int len = strlen(line);
      int buf_total = 1 + len + CHUNK;
      char *buf;

      if (!MemAlloc(buf,buf_total))
            return FALSE;
      
      strcpy(buf,line);
      int buf_used = len;
      char *p = buf+len;

      //long offset = ftell(m_file);
      fpos_t fpos;
      if (!GetFilePos(fpos))  fprintf(stderr,"screwed\n");

      while (fgets(line, sizeof line, m_file))
      {
            if (line[0] == '>')
            {
                  if (!SetFilePos(fpos)) { fprintf(stderr,"screwed\n"); }
                  //fseek(m_file,offset,SEEK_SET);
                  break;
            }
            len = strlen(line);
            if (buf_used +len +1 > buf_total)
            {
                  buf_total += CHUNK;
                  if (!MemResize(buf,buf_total))
                  {
                        PrintError("Out of memory");
                        MemDealloc(buf);
                        return FALSE;
                  }
                  p = buf + buf_used;
            }
            strcpy(p,line);
            buf_used += len;
            p += len;
            //offset = ftell(m_file);
            if (!GetFilePos(fpos))  fprintf(stderr,"screwed\n");
      }

      bool result = faseq.ParseText(buf,m_toupper,m_seqtype);

      MemDealloc(buf);
      return result;
}


int FastaFile::Write (FastaSeq &seq)
{
      if (!IsOpen())
            return 0;

      fprintf(m_file,">%s\n", seq.Defline());
      WriteSeqLines(m_file,seq.Sequence(),seq.Length());
      return 1;
}


int WriteSeqLines (FILE *fd, const char *seq, int len, int linelen)
{
      char line[1+SEQLINE_LEN_MAX], *p1, ch;
      const char *p2 =seq;
      int i;

      ///// force linelen to valid range
      if (linelen > SEQLINE_LEN_MAX)
            linelen = SEQLINE_LEN_MAX;
      else if (linelen < SEQLINE_LEN_MIN)
            linelen = SEQLINE_LEN_MIN;


      while (len >0)
      {
            p1 = line;
            for (i=0; i<len && i<linelen; ++i)
            {
                  if ((ch = *p2++) ==0)
                        break;
                  *p1++ = ch;
            }
            *p1 = 0;
            if (fprintf(fd,"%s\n",line) <0)
                  return 0;;
            len -= i;
      }
      return 1;
}



///////////////////////////////////////////////////////////////////////////////

char * ExtractSeqAcc (const char *seqid, char *acc)
{
      *acc = 0;

      const char *p;

      if (  (p=strstr(seqid,"lcl|")) ||
                  (p=strstr(seqid,"ref|")) ||
                  (p=strstr(seqid,"emb|")) ||
                  (p=strstr(seqid,"dbj|")) )
            p += 4;
      else if ( p=strstr(seqid,"gb|") )
            p += 3;
      else 
            return NULL;

      /*****
      const char *p = seqid;
      if (strncmp(p,"gi|",3)==0)  
      {
            if (p=strchr(p+3,'|')) p++;
            else return NULL;
      }
      else if (strncmp(p,"lcl|",4)==0 || strncmp(p,"ref|",4)==0)
      {
            p += 4;
      }
      *****/

      char *p2 = acc;
      while (*p && *p !='|')  *p2++ = *p++;
      *p2 =0;
      return acc;
}


char * ExtractSeqName (const char *seqid, char *name)
{
      *name = 0;

      const char *p = seqid;

      if (  (p=strstr(seqid,"lcl|")) ||
                  (p=strstr(seqid,"ref|")) ||
                  (p=strstr(seqid,"emb|")) ||
                  (p=strstr(seqid,"dbj|")) )
            p += 4;
      else if ( p=strstr(seqid,"gb|") )
            p += 3;
      else
      {
            strcpy(name,seqid);
            return name;
      }

      if (! (p = strchr(p,'|')) ) return NULL;
      p++;

      char *p2 = name;
      while (*p && *p != '|')  *p2++ = *p++;
      *p2 = 0;
      return name;
}


///////////////////////////////////////////////////////////////////////////////


/* initialize for computation of reverse complements */
static char compbase[128];

void init_revcomp()
{
      compbase['A'] = 'T';
      compbase['C'] = 'G';
      compbase['G'] = 'C';
      compbase['T'] = 'A';

      /* ambiguity code */
      compbase['B'] = 'V';    /* B = C, G or T */
      compbase['D'] = 'H';    /* D = A, G or T */
      compbase['H'] = 'D';    /* H = A, C or T */
      compbase['K'] = 'M';    /* K = G or T */
      compbase['M'] = 'K';    /* M = A or C */
      compbase['N'] = 'N';    /* N = A, C, G or T */
      compbase['R'] = 'Y';    /* R = A or G (purines) */
      compbase['S'] = 'S';    /* S = C or G */
      compbase['V'] = 'B';    /* V = A, C or G */
      compbase['W'] = 'W';    /* W = A or T */
      compbase['X'] = 'X';    /* X = A, C, G or T */
      compbase['Y'] = 'R';    /* Y = C or T (pyrimidines) */
}

/* form the reverse complement of a sequence */
void revcomp(char *s, int length)
{
      char *t, save;

      for (t = s + length - 1; s <= t; ++s, --t) {
            save = *s;
            *s = compbase[*t];
            *t = compbase[save];
      }
}




int seq_hash (long &hash, const char *seq, int len)
{
      if (seq==NULL) return 0;
      if (len <= 0) len = strlen(seq);

      const char *p = seq;

      char ch;
      unsigned long b, h = 0;

      int i;

      for (i=0; i<len; i++)
      {
            h <<= 2;                     // left-shift by 2 bits

            ch = *p++;                   // get the next base, convet to 2 bit code
            if (ch=='C')      b = 1;     // C -> 1  (binary 01)
            else if (ch=='A') b = 2;     // A -> 2  (binary 10)
            else if (ch=='G') b = 3;     // G -> 3  (binary 11)
            else ch = 0;                 // T -> 0  (binary 00) and any other character
      
            h |= b;                      // OR the 2-bit base code at positions 1 and 2

            b = (h & 0xC0000000) >> 30;  // extract bits 31 and 32
            h ^= b;                      // XOR those bits into positions 1 and 2
            h &= 0x3FFFFFFF;             // clear bits 31 and 32
      }

      
      hash |= 0x40000000;              // set bit 31 (so valid hash can never be zero)
      hash = (long)h;

      return 1;
}



/*
 * $Log: fasta-io.cpp,v $
 * Revision 1.1  2003/03/31 21:08:27  rotmistr
 * Imported to CVS
 * Compilable with gcc
 *
 */

Generated by  Doxygen 1.6.0   Back to index