File: sprot.c

package info (click to toggle)
squizz 0.99b%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 6,648 kB
  • ctags: 8,291
  • sloc: sh: 12,012; ansic: 2,607; lex: 1,944; yacc: 1,659; makefile: 119
file content (185 lines) | stat: -rw-r--r-- 4,489 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
/* sprot.c - SWISSPROT sequence functions */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <sys/types.h>
#ifdef HAVE_INTTYPES_H
#include <inttypes.h>
#endif
#include <stdio.h>
#ifdef STDC_HEADERS
# include <string.h>
#endif

#include "sequence.h"
#include "sequence/protein.h"
#include "sequence/sprot.h"

/* Unsigned 64-bit accumulator type used for the SwissProt CRC64 checksum. */
typedef uint64_t crc_t;

/*
 * CRC_CONST is the constant XORed into the shift register while the lookup
 * table is built in sprot_crc64().
 * CRC_PRINT is the printf length modifier + conversion used by sprot_print()
 * to emit a crc_t in upper-case hexadecimal.
 * Both the literal suffix and the length modifier are selected on _LP64.
 */
#ifdef _LP64
#define CRC_CONST 0xd800000000000000UL
#define CRC_PRINT "lX"
#else
#define CRC_CONST 0xd800000000000000ULL
#define CRC_PRINT "llX"
#endif


/*
 * Function prototypes.
 * sproty_parse() and sproty_check() are declared extern and are not defined
 * in this file; sprot_parse() and sprot_check() forward to them.
 * NOTE(review): presumably they come from the generated SPROT parser - confirm.
 * sprot_crc64() is a file-local helper defined at the end of this file.
 */
extern sequence_t *sproty_parse(FILE *);
extern int sproty_check(FILE *);
static crc_t sprot_crc64(char *);


/*
 * Parse a SPROT sequence from stream `f'.
 * The call is forwarded to sproty_parse() and its result is returned
 * unchanged.
 */
sequence_t *sprot_parse(FILE *f) {
  return sproty_parse(f);
}


/*
 * Check a SPROT sequence read from stream `f'.
 * The call is forwarded to sproty_check() and its status is returned
 * unchanged.
 */
int sprot_check(FILE *f) {
  return sproty_check(f);
}


/* Layout constants used by sprot_print() and its helpers */
enum {
  SPROT_PRINT_WIDTH = 80 - 5,   /* column limit used when wrapping lines */
  SPROT_PRINT_TAGLEN = 5,       /* length of a line tag such as "DE   " */
  SPROT_PRINT_SEQLINE = 60,     /* residues per sequence line */
  SPROT_PRINT_SEQGROUP = 10     /* residues per space-separated group */
};


/*
 * Print a NULL-terminated list of strings on one or more tagged lines.
 *
 *   f     - output stream
 *   items - NULL-terminated array of strings (may be NULL)
 *   tag   - line tag written at the start of every line (e.g. "AC   ")
 *   end   - text closing the last line (e.g. ";\n" or ".\n")
 *
 * Items on a line are separated by "; ".  When the next item would make the
 * line too wide, the line is closed with ";" and a new tagged line is begun.
 * Nothing is printed for a NULL or empty list, so no untagged line is ever
 * emitted.
 */
static void sprot_print_list(FILE *f, char **items, const char *tag,
                             const char *end) {
  size_t taglen = strlen(tag);
  size_t col = 0;               /* characters already on the current line */

  if (items == NULL || *items == NULL) { return; }

  while (*items != NULL) {
    const char *sep;
    size_t seplen, len;

    if (col == 0) { sep = tag; }            /* start of a line */
    else if (col == taglen) { sep = ""; }   /* only the tag so far */
    else { sep = "; "; }
    seplen = strlen(sep);
    len = strlen(*items);

    /* Wrap only when something is already on the line: an item that is too
     * wide even for an empty line is printed anyway, otherwise the same
     * item would be retried forever. */
    if (col > 0 && col + seplen + len + 1 > (size_t)SPROT_PRINT_WIDTH) {
      (void)fputs(";\n", f);
      col = 0;
      continue;
    }

    (void)fprintf(f, "%s%s", sep, *items);
    col += seplen + len;
    items++;
  }

  (void)fputs(end, f);
}


/*
 * Print the description as word-wrapped "DE   " lines.
 *
 *   f   - output stream
 *   dsc - NUL-terminated description text (may be NULL)
 *
 * Each line carries at most SPROT_PRINT_WIDTH - SPROT_PRINT_TAGLEN characters
 * of text.  Lines are broken at spaces or after a '-', never inside "--",
 * "->", an enzyme number ("1.2.-.-") or right after " (EC".  The last line
 * is terminated with '.'.  A token too long to fit on a line is split at the
 * width limit so that every iteration consumes input.
 */
static void sprot_print_description(FILE *f, const char *dsc) {
  const long width = SPROT_PRINT_WIDTH - SPROT_PRINT_TAGLEN;
  const char *p = dsc;

  while (p != NULL && *p != '\0') {
    const char *q, *limit;

    /* Skip the blanks left over from the previous break */
    while (*p == ' ') { p++; }
    (void)fputs("DE   ", f);

    /* Furthest position the line may extend to */
    q = p;
    while (*q != '\0' && q - p < width) { q++; }
    limit = q;

    /* Walk backwards until an acceptable break position is found */
    for (;;) {
      /* No acceptable break (or nothing left): cut at the width limit.
       * This also keeps `q' from moving in front of `p'. */
      if (q == p) { q = limit; break; }
      /* Do not split words */
      if (*q != '\0' && *q != ' ' && *q != '-') { q--; continue; }
      /* Do not split specials `--', `->' */
      if (*q == '-' && (*(q+1) == '-' || *(q+1) == '>')) { q--; continue; }
      /* Leave room for a trailing '-' or the final '.' */
      if ((*q == '-' || *q == '\0') && q - p + 1 > width) { q--; continue; }
      /* Do not split Enzyme refs */
      if (*q == '-' && q - p > 1 && *(q-1) == '.') { q--; continue; }
      if ((*q == '\0' || *q == ' ') && q - p > 4 &&
          strncmp(q-4, " (EC", 4) == 0) { q--; continue; }
      /* A hyphen stays at the end of the line it breaks */
      if (*q == '-') { q++; }
      break;
    }

    while (*p != '\0' && p < q) {
      (void)fputc(*p, f);
      p++;
    }
    if (*p == '\0') { (void)fputc('.', f); }
    (void)fputc('\n', f);
  }
}


/*
 * Print a sequence in SWISSPROT format.
 *
 *   f   - output stream
 *   seq - sequence to print; nothing is written when NULL
 *
 * Emits, in order: the ID line, the AC line(s) when seq->acc is set, the DE
 * line(s) when seq->dsc is set, the KW line(s) when seq->kwd is set, the SQ
 * header (length, weight from protein_weight(), CRC64 from sprot_crc64()),
 * the sequence in lines of 60 residues grouped by 10, and the "//"
 * terminator.  '*' characters in the sequence are not printed and do not
 * count towards the line/group layout.
 */
void sprot_print(FILE *f, sequence_t *seq) {
  const char *name, *p;
  float w;
  long n;
  size_t len;
  crc_t crc;

  if (seq == NULL) { return; }

  /* Name */
  name = (seq->nam != NULL) ? seq->nam : "UNKNOWN";
  len = seq->strlen;
  /* size_t is cast explicitly: it need not be `unsigned long' */
  (void)fprintf(f, "ID   %-14.14s %s; %8s; %5lu AA.\n", name, "STANDARD",
                "PRT", (unsigned long)len);

  /* Accession */
  /* FIXME: Sort secondary accessions alphabetically ... */
  sprot_print_list(f, seq->acc, "AC   ", ";\n");

  /* Description */
  sprot_print_description(f, seq->dsc);

  /* Keywords */
  sprot_print_list(f, seq->kwd, "KW   ", ".\n");

  /* Sequence header */
  w = protein_weight(seq->str) + 0.5;   /* +0.5 rounds on truncation below */
  crc = sprot_crc64(seq->str);
  (void)fprintf(f, "SQ   SEQUENCE   %lu AA;  %lu MW", (unsigned long)len,
                (unsigned long)w);
  (void)fprintf(f, ";  %016" CRC_PRINT " CRC64;\n", crc);

  /* Sequence: the layout is driven by the number of residues actually
   * printed, so skipped '*' characters cannot shift the groups or leave an
   * empty indented line behind. */
  n = 0;
  for (p = seq->str; *p != '\0'; p++) {
    if (*p == '*') { continue; }
    if (n % SPROT_PRINT_SEQLINE == 0) {
      if (n > 0) { (void)fputc('\n', f); }
      (void)fputs("     ", f);
    } else if (n % SPROT_PRINT_SEQGROUP == 0) {
      (void)fputc(' ', f);
    }
    (void)fputc(*p, f);
    n++;
  }
  (void)fputc('\n', f);

  /* End */
  (void)fputs("//\n", f);
}


/*
 * Calculate the SwissProt CRC64 (x64 + x4 + x3 + x1 + 1) of a
 * NUL-terminated string.
 *
 *   str - text to checksum
 *
 * Returns the checksum, starting from an initial value of 0.  Bytes are fed
 * through a 256-entry lookup table which is rebuilt on every call; the
 * register is shifted right, i.e. the least significant bit is handled first.
 */
static crc_t sprot_crc64(char *str) {
  crc_t table[256];
  crc_t sum = 0;
  const char *cur;
  int byte, bit;

  /* Build the lookup table: one entry per possible input byte */
  for (byte = 0; byte < 256; byte++) {
    crc_t reg = byte;
    for (bit = 0; bit < 8; bit++) {
      crc_t carry = reg & 1;
      reg >>= 1;
      if (carry) { reg ^= CRC_CONST; }
    }
    table[byte] = reg;
  }

  /* Fold every input byte into the running checksum */
  for (cur = str; *cur != '\0'; cur++) {
    int slot = (int)((sum ^ *cur) & 0xff);   /* low 8 bits select the entry */
    sum = (sum >> 8) ^ table[slot];
  }

  return sum;
}