1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410
|
/******************************************************************************************
*
* File: ONElib.h
* Header for ONE file reading and writing
*
* Authors: Richard Durbin (rd109@cam.ac.uk), Gene Myers (myers@mpi-cbg.de)
* Copyright (C) Richard Durbin, Gene Myers, 2019-
*
* HISTORY:
* Last edited: Dec 3 06:08 2022 (rd109)
* * Dec 3 06:01 2022 (rd109): remove oneWriteHeader(), switch to stdarg for oneWriteComment etc.
* * Dec 27 09:46 2019 (gene): style edits
* * Created: Sat Feb 23 10:12:43 2019 (rd109)
*
*****************************************************************************************/
#ifndef ONE_DEFINED
#define ONE_DEFINED
#include <stdio.h> // for FILE etc.
#include <stdarg.h> // for formatted writing in oneWriteComment(), oneAddProvenance()
#include <inttypes.h> // for standard size int types and their PRI print macros
#include <stdbool.h> // for standard bool types
#include <limits.h> // for INT_MAX etc.
#include <string.h> // for strlen() used by the oneNextString() macro
#include <pthread.h>
/***********************************************************************************
*
* DATA TYPES
*
**********************************************************************************/
// Basic Types
// Short aliases for the fixed-width signed integer types, plus U8 for raw unsigned bytes.
// The whole group is skipped when U8_DEFINED is already defined, so the typedefs are
// declared only once even if another header uses the same guard.
#ifndef U8_DEFINED
#define U8_DEFINED
typedef int8_t I8;
typedef int16_t I16;
typedef int32_t I32;
typedef int64_t I64;
typedef unsigned char U8;
#endif // U8_DEFINED
// Type code of a field within a line. Numbering starts at 1, so 0 is never a valid type.
typedef enum { oneINT = 1, oneREAL, oneCHAR, oneSTRING,
oneINT_LIST, oneREAL_LIST, oneSTRING_LIST, oneDNA } OneType;
// Printable name of each OneType, indexed by the enum value (slot 0 is unused).
// Only declared here; the definition elsewhere is expected to be:
extern char* oneTypeString[] ;
// = { 0, "INT", "REAL", "CHAR", "STRING", "INT_LIST", "REAL_LIST", "STRING_LIST", "DNA" } ;
// Storage for a single field of the current line. Which member is meaningful depends on
// the field's OneType; user code normally goes through the oneInt(), oneReal(), oneChar()
// and oneLen() macros rather than touching the union directly.
typedef union
{ I64 i; // integer value, see oneInt()
double r; // real value, see oneReal()
char c; // character value, see oneChar()
I64 len; // For lists : top 8 bits encode excess bytes, low 56 length
} OneField;
// One provenance entry of the file header (a '!' line). OneFile.provenance points to the
// entries; new ones are appended with oneAddProvenance().
typedef struct
{ char *program; // program name
char *version; // program version
char *command; // command text - presumably built from the format arguments of
// oneAddProvenance(); confirm against the implementation
char *date; // date string - format is not specified in this header
} OneProvenance;
// A file named in the header. The same record type is used for reference ('<') entries
// and for deferred ('>') entries, see OneFile.reference and OneFile.deferred.
typedef struct
{ char *filename; // name of the referenced/deferred file
I64 count; // count given to oneAddReference(); oneAddDeferred() takes no count
} OneReference;
// Count statistics for one line type. OneInfo keeps two copies: the values accumulated
// while reading/writing and the values given in the header.
// NOTE(review): this header does not spell out the meaning of the individual members;
// the per-member notes below are assumptions to be confirmed against the implementation.
typedef struct
{ I64 count; // presumably the number of lines of this type
I64 max; // presumably the largest list length seen on a line of this type
I64 total; // presumably the sum of all list lengths for this type
I64 groupCount; // presumably the largest count within any one group
I64 groupTotal; // presumably the largest total within any one group
} OneCounts;
// OneCodecs are a private package for binary ONE file compression.
// The type is opaque: this header only ever handles codecs through OneCodec pointers.
typedef void OneCodec; // forward declaration of opaque type for compression codecs
// DNAcodec is a special pre-existing compressor one should use for DNA.
// It compresses every base to 2-bits, where any non-ACGT letter is
// effectively converted to an A. Compression is case insensitive,
// but decompression always delivers lower-case.
extern OneCodec *DNAcodec;
// Record for a particular line type. There is at most one list element.
// OneFile.info[] and OneSchema.info[] each hold one pointer to such a record per
// line-type character (the array is indexed by the character code).
typedef struct
{ OneCounts accum; // counts read or written to this moment
OneCounts given; // counts read from header
I64 gCount; // used internally to calculate groupCount and groupTotal
I64 gTotal; // (companion of gCount, same internal use)
I64 oCount; // # of objects in prefix before first group (if any)
I64 oTotal; // + of objects in prefix (these 2 are for thread parallel apps)
int nField; // number of fields
OneType *fieldType; // type of each field (presumably nField entries)
int listEltSize; // size of list field elements (if present, else 0)
int listField; // field index of list
char *comment; // the comment on the definition line in the schema
bool isUserBuf; // flag for whether buffer is owned by user
I64 bufSize; // system buffer and size if not user supplied
void *buffer; // holds the list data of the most recent line of this type
OneCodec *listCodec; // compression codec and flags
bool isUseListCodec; // on once enough data collected to train associated codec
char binaryTypePack; // binary code for line type, top bit set (binary line codes are >= 128).
// bit 0: list compressed
// NOTE(review): the format notes at the end of this header describe two
// flag bits (fields, list) - confirm which description is current.
I64 listTack; // accumulated training data for this thread's codec (master)
} OneInfo;
// the schema type - the first record is the header spec, then a linked list of primary classes
// Each record after the first corresponds to one primary file type of the schema.
typedef struct OneSchema
{
char *primary ; // primary file type name (the 'P' line of the schema)
int nSecondary ; // number of entries in secondary[]
char **secondary ; // secondary file type names (the 'S' lines of the schema)
OneInfo *info[128] ; // per-linetype information, indexed by line-type character
int nFieldMax ; // presumably the largest nField over all line types - confirm
char objectType ; // line designation character for primary objects ('O' line)
char groupType ; // line designation character for groups ('G' line, optional)
struct OneSchema *nxt ; // next schema record in the linked list
} OneSchema ;
// Singly linked list of text items attached to a file header; OneFile.headerText points
// to the first item.
typedef struct OneHeaderText
{ char *text ; // the text of this item
struct OneHeaderText *nxt ; // next item in the list
} OneHeaderText ;
// The main OneFile type - this is the primary handle used by the end user
// It is returned by the oneFileOpen...() routines and released by oneFileClose().
typedef struct
{
// this field may be set by the user
bool isCheckString; // set if want to validate string char by char
// these fields may be read by user - but don't change them!
char *fileType; // file type (primary type, as in the '1' header line)
char *subType; // file subtype (as in the optional '2' header line)
char lineType; // current lineType
char objectType; // line designation character for primary objects
char groupType; // line designation character for groups (optional)
I64 line; // current line number
I64 byte; // current byte position when writing binary
I64 object; // current object - incremented when object line read
I64 group; // current group - incremented when group line read
OneProvenance *provenance; // if non-zero then count['!'] entries
OneReference *reference; // if non-zero then count['<'] entries
OneReference *deferred; // if non-zero then count['>'] entries
OneField *field; // used to hold the current line - accessed by macros
OneInfo *info[128]; // all the per-linetype information, indexed by line-type character
I64 codecTrainingSize; // amount of data to see before building codec
// fields below here are private to the package
FILE *f; // the underlying stdio stream
bool isWrite; // true if open for writing
bool isHeaderOut; // true if header already written
bool isBinary; // true if writing a binary file
bool inGroup; // set once inside a group
bool isLastLineBinary; // needed to deal with newlines on ascii files
bool isIndexIn; // index read in
bool isBig; // are we on a big-endian machine?
bool isNoAsciiHeader; // backdoor for ONEview to avoid writing header in ascii
char lineBuf[128]; // working buffers
char numberBuf[32];
int nFieldMax; // presumably the number of entries available in field[] - confirm
I64 codecBufSize; // allocated size of codecBuf
char *codecBuf; // working buffer for codec (de)compression of list data
I64 nBits; // number of bits of list currently in codecBuf
I64 intListBytes; // number of bytes per integer in the compacted INT_LIST
I64 linePos; // current line position
OneHeaderText *headerText; // arbitrary descriptive text that goes with the header
char binaryTypeUnpack[256]; // invert binary line code to ASCII line character.
int share; // index if slave of threaded write, +nthreads > 0 if master
int isFinal; // oneFinalizeCounts has been called on file
pthread_mutex_t fieldLock; // Mutexes to protect training accumulation stats when threaded
pthread_mutex_t listLock;
} OneFile; // the footer will be in the concatenated result.
/***********************************************************************************
*
* ROUTINES FOR READING & WRITING ONE FILES IN BOTH ASCII & BINARY (TRANSPARENTLY)
*
**********************************************************************************/
// CREATING AND DESTROYING SCHEMAS
OneSchema *oneSchemaCreateFromFile (char *path) ;
OneSchema *oneSchemaCreateFromText (char *text) ;
// These functions create a schema handle that can be used to open One-code data files
// for reading and writing. A schema file is itself a One-code file, consisting of
// a set of objects, one per primary file type. Valid lines in this file are:
// P <primary file type> // a short string
// S <secondary file type> // a short string - any number of these
// O <char> <field_list> // definition of object type
// G <char> <field_list> // definition of group type - first field must be an int
// D <char> <field_list> // definition of line
// <char> must be a lower or upper case letter.
// <field_list> is a list of field types from:
// CHAR, INT, REAL, STRING, INT_LIST, REAL_LIST, STRING_LIST, DNA
// Only one list type (STRING, *_LIST or DNA) is allowed per line type.
// All the D lines following an O line apply to that object type.
// By convention comments on each line explain the definition.
// Example, with lists and strings preceded by their length in OneCode style
// P 3 seq this is a sequence file
// O S 1 3 DNA the DNA sequence - each S line starts an object
// D Q 1 6 STRING the phred encoded quality score + ASCII 33
// D N 4 4 REAL 4 REAL 4 REAL 4 REAL signal to noise ratio in A, C, G, T channels
// G g 2 3 INT 6 STRING group designator: number of objects, name
// The ...FromText() alternative writes the text to a temp file and reads it with
// oneSchemaCreateFromFile(). This allows code to set the schema.
// Internally a schema is a linked list of OneSchema objects, with the first holding
// the (hard-coded) schema for the header and footer, and the remainder each
// corresponding to one primary file type.
// The returned handle should be released with oneSchemaDestroy() when no longer needed.
// NOTE(review): the return value on failure (bad path / malformed schema) is not
// documented here - confirm against the implementation before relying on NULL checks.
void oneSchemaDestroy (OneSchema *schema) ;
// Destroys a schema handle obtained from oneSchemaCreateFromFile() or
// oneSchemaCreateFromText(). Presumably releases the whole linked list of OneSchema
// records - confirm against the implementation.
// READING ONE FILES:
OneFile *oneFileOpenRead (const char *path, OneSchema *schema, char *type, int nthreads) ;
// Open ONE file 'path', either binary or ascii encoded, for reading.
// If the file doesn't have a header, then 'type' must be specified,
// otherwise, if 'type' is non-zero it must match the header type.
// All header information (if present) is read.
// 'schema' is also optional. If it is NULL then the file must contain its own schema.
// If 'schema' is present then it must support 'type', and if the file contains its
// own schema, then that must be a subset of the one for this type in 'schema'.
// If nthreads > 1 then nthreads OneFiles are generated as an array and the pointer
// to the first, called the master, is returned. The other nthreads-1 files are
// called slaves. The package routines are aware of when a OneFile argument is a
// slave or master in a parallel group. The master receives provenance, counts, etc.
// The slaves only read data and have the virtue of sharing indices and codecs with
// the master if relevant.
// Close the returned handle with oneFileClose().
bool oneFileCheckSchema (OneFile *vf, char *textSchema) ;
// Checks if file schema is consistent with text schema. Mismatches are reported to stderr.
// Filetype and all linetypes in text must match. File schema can contain additional linetypes.
// 'textSchema' uses the schema line syntax described for oneSchemaCreateFromText().
// The result is meant to be tested for truth, as in the example:
// e.g. if (! oneFileCheckSchema (vf, "P 3 seq\nD S 1 3 DNA\nD Q 1 6 STRING\nD P 0\n")) die () ;
// This is provided to enable a program to ensure that its assumptions about data layout
// are satisfied.
char oneReadLine (OneFile *vf) ;
// Read the next ONE formatted line returning the line type of the line, or 0
// if at the end of the data section. The content macros immediately below are
// used to access the information of the line most recently read.
// Typical use is a loop of the form: while ((t = oneReadLine (vf))) { ...dispatch on t... }
// Helpers behind the list access macros; user code should call the macros
// (oneString, oneIntList, oneDNA2bit, ...) rather than these functions directly.
void *_oneList (OneFile *vf) ; // lazy codec decompression if required
void *_oneCompressedList (OneFile *vf) ; // lazy codec compression if required
// Field accessors for the line most recently read (or being assembled for writing).
// oneInt/oneReal/oneChar expand to lvalues, so they may appear on the l.h.s. of an
// assignment, e.g. oneInt(vf,2) = 3.
#define oneInt(vf,x)        ((vf)->field[x].i)
#define oneReal(vf,x)       ((vf)->field[x].r)
#define oneChar(vf,x)       ((vf)->field[x].c)
// Index of the list field for the current line type (helper for oneLen).
#define _LF(vf)             ((vf)->info[(int)(vf)->lineType]->listField)
// List length: the low 56 bits of the list field (the top 8 bits carry other data).
#define oneLen(vf)          ((vf)->field[_LF(vf)].len & 0xffffffffffffffLL)
// Typed views of the current line's list data. Every expansion is wrapped in parentheses
// so that postfix operators bind to the cast result: oneString(vf)[0], oneIntList(vf)[i].
#define oneString(vf)       ((char *) _oneList(vf))
#define oneDNAchar(vf)      ((char *) _oneList(vf))
#define oneDNA2bit(vf)      ((U8 *) _oneCompressedList(vf))
#define oneIntList(vf)      ((I64 *) _oneList(vf))
#define oneRealList(vf)     ((double *) _oneList(vf))
// Pointer to the string that follows s in a string list. The argument is parenthesized
// so any pointer expression can be passed; note that s is evaluated twice, so it must
// not have side effects. Uses strlen() from <string.h>.
#define oneNextString(vf,s) ((s) + strlen(s) + 1)
// Access field information. The index x of a list object is not required as there is
// only one list per line, stored in ->buffer.
// A "string list" is implicitly supported, get the first string with oneString, and
// subsequent strings sequentially with oneNextString, e.g.:
//
// char *s = oneString(vf);
// for (i = 0; i < oneLen(vf); i++)
// { // do something with i'th string
// s = oneNextString(vf,s);
// }
char *oneReadComment (OneFile *vf);
// Can be called after oneReadLine() to read any optional comment text after the fixed fields.
// Returns NULL if there is no comment.
// NOTE(review): ownership/lifetime of the returned string is not stated here - presumably
// it points into package-owned storage; confirm before freeing or keeping the pointer.
// WRITING ONE FILES:
OneFile *oneFileOpenWriteNew (const char *path, OneSchema *schema, char *type,
bool isBinary, int nthreads);
OneFile *oneFileOpenWriteFrom (const char *path, OneFile *vfIn,
bool isBinary, int nthreads);
// Create a new oneFile that will be written to 'path'. For the 'New' variant supply
// the file type, subtype (if non-zero), and whether it should be binary or ASCII.
// For the 'From' variant, specify binary or ASCII, schema and all other header
// information is inherited from 'vfIn', where the count stats are from vfIn's
// accumulation (assumes vfIn has been fully read or written) if 'useAccum' is true,
// and from vfIn's header otherwise.
// NOTE(review): neither signature above has a separate subtype argument or a 'useAccum'
// flag; this part of the description appears to predate the current interface - confirm
// how the subtype and the count source are chosen in the implementation.
// If nthreads > 1 then nthreads OneFiles are generated as an array and the pointer
// to the first, called the master, is returned. The other nthreads-1 files are
// called slaves. The package routines are aware of when a OneFile argument is a
// slave or master in a parallel group. The slaves are expected to only write data
// lines, with the master adding provenance, producing the header, and then some
// segment of the initial data lines. Upon close the final result is effectively
// the concatenation of the master, followed by the output of each slave in sequence.
bool oneInheritProvenance (OneFile *vf, OneFile *source);
bool oneInheritReference (OneFile *vf, OneFile *source);
bool oneInheritDeferred (OneFile *vf, OneFile *source);
// Add all provenance/reference/deferred entries in source to header of vf. Must be
// called before first call to oneWriteLine.
// NOTE(review): the meaning of the bool result is not documented here - presumably
// true on success; confirm against the implementation.
bool oneAddProvenance (OneFile *vf, char *prog, char *version, char *format, ...);
bool oneAddReference (OneFile *vf, char *filename, I64 count);
bool oneAddDeferred (OneFile *vf, char *filename);
// Append provenance/reference/deferred to header information. Must be called before
// first call to oneWriteLine.
// oneAddProvenance() takes a printf-style 'format' followed by its arguments (handled
// via stdarg); presumably the formatted text becomes the command field of the entry.
// For ASCII output, if you want the header to contain count information then you must
// create and fill the relevant OneCounts objects before the first call to oneWriteLine.
// For BINARY output, the OneCounts information is accumulated and written automatically.
void oneWriteLine (OneFile *vf, char lineType, I64 listLen, void *listBuf);
// Set up a line for output just as it would be returned by oneReadLine and then call
// this routine to output the line (ASCII or binary).
// Use the macros above on the l.h.s. of assignments to fill fields (e.g. oneInt(vf,2) = 3).
// For lists, give the length in the listLen argument, and either place the list data in your
// own buffer and give it as listBuf, or put in the line's buffer and set listBuf == NULL.
// 'lineType' is the line designation character of the line being written.
void oneWriteLineFrom (OneFile *vf, OneFile *source) ; // copies a line from source into vf
void oneWriteLineDNA2bit (OneFile *vf, char lineType, I64 listLen, U8 *dnaBuf);
// Minor variants of oneWriteLine().
// Use oneWriteLineFrom to pass on the line held by 'source' (e.g. the one just read).
// Use oneWriteLineDNA2bit for DNA lists if your DNA is already 2-bit compressed.
void oneWriteComment (OneFile *vf, char *format, ...); // can not include newline \n chars
// Adds a comment to the current line. Extends line in ascii, adds special line type in binary.
// 'format' is a printf-style format string followed by its arguments (handled via stdarg).
// CLOSING FILES (FOR BOTH READ & WRITE)
void oneFileClose (OneFile *vf);
// Close vf (opened either for reading or writing). Finalizes counts, merges threaded files,
// and writes footer if binary. Frees all non-user memory associated with vf.
// Buffers supplied through oneUserBuffer() are user memory and remain the caller's to free.
// GOTO & BUFFER MANAGEMENT
void oneUserBuffer (OneFile *vf, char lineType, void *buffer);
// A buffer is used to capture the list element of each line type that has one.
// This routine allows you to reassign the buffer to one you've allocated, or
// to revert to a default system buffer if 'buffer' == NULL. The previous buffer
// (if any) is freed. The user must ensure that a buffer they supply is large
// enough. Note that this buffer is overwritten with each new line read of the given type.
bool oneGotoObject (OneFile *vf, I64 i);
// Goto i'th object in the file. This only works on binary files, which have an index.
// NOTE(review): presumably returns false when the move is not possible (no index, or
// i out of range) - confirm against the implementation.
I64 oneGotoGroup (OneFile *vf, I64 i);
// Goto the first object in group i. Return the size (in objects) of the group, or 0
// if an error (i out of range or vf has no group type). Only works for binary files.
/***********************************************************************************
*
* A BIT ABOUT THE FORMAT OF BINARY FILES
*
**********************************************************************************/
// <bin file> <- <ASCII Prolog> <$-line> <binary data> <footer> <^-line> <footer-size:int64>
//
// '$'-line flags file is binary and gives endian
// The data block ends with a blank line consisting of '\n'
//
// EWM: Removed '-' line, simply write off_t to footer start
//
// <ASCII Prolog> <- <'1'-line> [<'2'-line>] ( <'!'-line> | <'<'-line> | <'>'-line> )*
//
// The ASCII prolog contains the type, subtype, provenance, reference, and deferred lines
// in the ASCII format. The ONE count statistic lines for each data line type are found
// in the footer along with binary ';' and ':' lines that encode their compressors as
// needed. The footer also contains binary '&' and '*' lines that encode the object index
// and group indices, respectively.
//
// <Binary line> <- <Binary line code + tags> <fields> [<list data>]
//
// Line codes are >= 128 for binary encoded lines. The two low-order bits of these are flags,
// so each binary-encoded line type has 4 codes and a table maps these to the ASCII code.
// Bit 0 indicates if the fields of the line type are compressed, and Bit 1 indicates if
// the list data (if present) is compressed.
//
// If a field is a list, then the field array element for that field is the list's length
// where the low 56 bits encode length, and the high 8 bits encode the # of high-order
// 0-bytes in every list element if an INT_LIST (0 otherwise).
#endif // ONE_DEFINED
/******************* end of file **************/
|