File: streamcsv.go

package info (click to toggle)
golang-github-kshedden-dstream 0.0~git20190512.c4c4106-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 596 kB
  • sloc: makefile: 30
file content (458 lines) | stat: -rw-r--r-- 10,204 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
package dstream

import (
	"encoding/csv"
	"errors"
	"fmt"
	"io"
	"os"
	"time"
)

// CSVReader supports reading a Dstream from an io.Reader.  A
// CSVReader is created by FromCSV, configured by chaining its
// configuration methods, and finalized by calling Done.
type CSVReader struct {

	// The underlying source of raw CSV text, as passed to FromCSV
	rdr io.Reader

	// The csv reader that parses the raw text
	csvrdr *csv.Reader

	// bdata holds the data of the current chunk, one element per
	// variable, indexed by position in the dstream (see GetPos and Get)
	bdata []interface{}

	// The first row must be read during init, either to obtain the
	// header or to check the number of columns.  When there is no
	// header that row is data, so it is stored here so that it can
	// be included in the first data chunk.
	firstrow []string

	// If true, skip records with unparseable CSV data, otherwise
	// panic on them.
	skipErrors bool

	// The field delimiter, assigned to csv.Reader.Comma by init.  A
	// zero value leaves the csv.Reader default in place.
	comma rune

	// The number of records to read at once.  init substitutes
	// 10000 if this is zero.
	chunkSize int

	// The number of variables (set by init to len(types))
	nvar int

	// The number of observations, not known until reading is complete
	nobs int

	// If true, the first row of the data file is a header (contains column names)
	hasheader bool

	// doneinit becomes true after init has run to completion
	doneinit bool

	// done is true once the stream has been fully read; NumObs
	// reports nobs only while it is set, and Reset clears it.
	done bool

	// Index of current chunk
	chunknum int

	// If limitchunk > 0, read only this many chunks, otherwise read all chunks.
	limitchunk int

	// Map from variable names to column positions in the dstream
	namepos map[string]int

	// Map from variable names to column positions in the file
	filenamepos map[string]int

	// The positions in the file of the variables in the dstream, in dstream order
	filepos []int

	// The names of the dstream variables
	names []string

	// The variable name/type structs, provided by the caller through SetTypes
	types []VarType

	// The variable types, in dstream order (copied from types by setNames)
	dtypes []Dtype

	// A function for parsing time values, provided by the caller through ParseTime
	parseTime func(string) time.Time
}

// VarType defines the name and type of one column of the CSV file.
// A slice of VarType values is passed to CSVReader.SetTypes to select
// the variables that are read.
type VarType struct {

	// The variable name.  When the file has a header, this is matched
	// against the column names in the header row.
	Name string

	// The data type of the variable
	Type Dtype
}

// FromCSV starts the construction of a Dstream that reads CSV data
// from r.  SetTypes must be called on the result to define the
// variables to be retrieved (Done panics otherwise).  Further
// configuration is done by chaining calls to the other configuration
// methods, and Done is called last to produce the Dstream.
func FromCSV(r io.Reader) *CSVReader {
	return &CSVReader{rdr: r}
}

// ParseTime sets the function used to parse time values.  It returns
// the receiver so that configuration calls can be chained.
func (cs *CSVReader) ParseTime(f func(string) time.Time) *CSVReader {
	cs.parseTime = f
	return cs
}

// Done is called when all configuration is complete.  It runs init,
// which reads the first row of the input, and therefore panics if
// that read fails or if SetTypes has not been called.  After calling
// Done, the Dstream can be used.
func (cs *CSVReader) Done() Dstream {
	cs.init()
	return cs
}

// SkipErrors results in lines with unparseable CSV content being
// skipped (the csv.ParseError is printed to stdio).  It returns the
// receiver so that configuration calls can be chained.
func (cs *CSVReader) SkipErrors() *CSVReader {
	cs.skipErrors = true
	return cs
}

// SetTypes specifies the names and types of the variables.  If the CSV
// file has a header, these values may appear in any order, and
// variables not included in types are omitted.  If the CSV file does
// not have a header, then types must match the columns in the file,
// in the correct order; Done panics if the number of entries differs
// from the number of columns in the first row.  The slice is stored
// as given, not copied.
func (cs *CSVReader) SetTypes(types []VarType) *CSVReader {
	cs.types = types
	return cs
}

// LimitChunk sets the maximum number of chunks to read.  A value that
// is not positive means that all chunks are read.  It returns the
// receiver so that configuration calls can be chained.
func (cs *CSVReader) LimitChunk(n int) *CSVReader {
	cs.limitchunk = n
	return cs
}

// Close does nothing and is implemented to satisfy the Dstream
// interface.  The CSVReader never closes the io.Reader passed to
// FromCSV; if that reader needs closing, it should be closed by the
// caller.
func (cs *CSVReader) Close() {
}

// HasHeader declares that the first row of the data file holds the
// column names rather than data.  Without this call the file is
// treated as having no header.  It must be called before Done; calling
// it once initialization has completed panics.
func (cs *CSVReader) HasHeader() *CSVReader {
	if !cs.doneinit {
		cs.hasheader = true
		return cs
	}

	// The first row has already been consumed and interpreted.
	panic("FromCSV: can't call HasHeader after beginning data read")
}

// Comma sets the field delimiter (comma rune) for the CSVReader.  By
// default, the comma rune is a comma; passing a zero rune keeps that
// default, since init only overrides csv.Reader.Comma for non-zero
// values.  It returns the receiver so that calls can be chained.
func (cs *CSVReader) Comma(c rune) *CSVReader {
	cs.comma = c
	return cs
}

// checkArgs is the hook for consistency checks on the configuration.
// It is called at the start of init and currently performs no checks.
func (cs *CSVReader) checkArgs() {

}

// init finishes construction of the CSVReader: it applies defaults,
// creates the csv parser, consumes the first row of the input, and
// builds the name/position tables.  It panics if the first row cannot
// be read, if SetTypes was not called, or if the file has no header
// and its column count differs from the number of types.
func (cs *CSVReader) init() {

	cs.checkArgs()

	// Fall back to the default chunk size if none was configured.
	if cs.chunkSize == 0 {
		cs.chunkSize = 10000
	}

	parser := csv.NewReader(cs.rdr)
	if cs.comma != 0 {
		parser.Comma = cs.comma
	}
	cs.csvrdr = parser

	// The first row is either the header or the first data record.
	row, err := cs.csvrdr.Read()
	if err != nil {
		panic(err)
	}

	ntypes := len(cs.types)
	if ntypes == 0 {
		panic("SetTypes must be called.")
	}
	cs.nvar = ntypes

	cs.filenamepos = make(map[string]int)
	switch {
	case cs.hasheader:
		// The header gives the file position of every column name.
		for j, name := range row {
			cs.filenamepos[name] = j
		}
	default:
		// The row is data, keep it for the first chunk.
		cs.firstrow = row

		if len(row) != ntypes {
			panic(fmt.Sprintf("File has %d columns and no header, but types has %d values.", len(row), ntypes))
		}

		// Without a header, types lists the columns in file order.
		for j := range cs.types {
			cs.filenamepos[cs.types[j].Name] = j
		}
	}

	cs.setNames()
	cs.setbdata()

	cs.doneinit = true
}

// setNames builds the per-variable tables (names, dtypes, namepos and
// filepos) from the caller-provided types and from filenamepos, which
// init populates before calling this method.
//
// It panics if a variable listed in types has no column in the file.
// This can only happen when the file has a header that lacks the
// name; without this check the missing name would silently be mapped
// to file column 0, the zero value of the map lookup.
func (cs *CSVReader) setNames() {

	// By the time this is called, types is guaranteed to be non-empty.
	n := len(cs.types)
	cs.dtypes = make([]Dtype, n)
	cs.names = make([]string, n)
	cs.namepos = make(map[string]int, n)
	cs.filepos = make([]int, n)
	for pos, vt := range cs.types {
		fpos, ok := cs.filenamepos[vt.Name]
		if !ok {
			msg := fmt.Sprintf("FromCSV: variable '%s' not found in the file header", vt.Name)
			panic(msg)
		}
		cs.names[pos] = vt.Name
		cs.dtypes[pos] = vt.Type
		cs.namepos[vt.Name] = pos
		cs.filepos[pos] = fpos
	}
}

// ChunkSize sets the size of chunks (the number of records read at
// once) for this Dstream.  It is intended to be called before reading
// begins; note that this method does not itself enforce that.  A
// value of zero selects the default of 10000, which is applied when
// Done is called.
func (cs *CSVReader) ChunkSize(c int) *CSVReader {
	cs.chunkSize = c
	return cs
}

// Names returns the names of the variables in the dstream, in dstream
// order.  The returned slice is the reader's internal slice, not a
// copy, and is nil until Done has been called.
func (cs *CSVReader) Names() []string {
	return cs.names
}

// NumVar returns the number of variables in the dstream.  The value
// is set when Done is called and is zero before that.
func (cs *CSVReader) NumVar() int {
	return cs.nvar
}

// NumObs reports how many observations the dstream contains.  The
// count is only known once the dstream has been read to the end;
// before that, -1 is returned.
func (cs *CSVReader) NumObs() int {
	if !cs.done {
		return -1
	}
	return cs.nobs
}

// GetPos returns a chunk of a data column by column position, where j
// is the position of the variable in the dstream (not in the file).
// An out-of-range j causes an index-out-of-range panic.
func (cs *CSVReader) GetPos(j int) interface{} {
	return cs.bdata[j]
}

// Get looks up a variable by name and returns the current chunk of
// its data column.  It panics if the dstream has no variable with the
// given name.
func (cs *CSVReader) Get(na string) interface{} {
	if j, found := cs.namepos[na]; found {
		return cs.bdata[j]
	}
	panic(fmt.Sprintf("Variable '%s' not found", na))
}

// Reset attempts to reset the Dstream that is reading from an
// io.Reader, so that reading starts again from the first record.
// This is only possible if the underlying reader is seekable (an
// io.ReadSeeker), so Reset panics if it is not, or if the seek fails.
// It also panics if called before Done, or if the header row cannot
// be re-read.  If no observations have been recorded yet (nobs is
// zero), Reset returns without touching the reader.
func (cs *CSVReader) Reset() {
	if !cs.doneinit {
		panic("cannot reset, Dstream has not been fully constructed")
	}

	if cs.nobs == 0 {
		return
	}

	r, ok := cs.rdr.(io.ReadSeeker)
	if !ok {
		panic("cannot reset")
	}
	if _, err := r.Seek(0, io.SeekStart); err != nil {
		panic(err)
	}
	cs.nobs = 0
	cs.done = false
	cs.chunknum = 0

	// A fresh csv.Reader is required because the old one may hold
	// data that it read ahead before the seek.  It must be given the
	// same delimiter that init configured, otherwise a file with a
	// non-default delimiter would be misparsed after a Reset.
	cs.csvrdr = csv.NewReader(r)
	if cs.comma != 0 {
		cs.csvrdr.Comma = cs.comma
	}

	// Skip over the header if needed.
	if cs.hasheader {
		if _, err := cs.csvrdr.Read(); err != nil {
			panic(err)
		}
	}
}

// CSVWriter supports writing a Dstream to an io.Writer in csv format.
// A CSVWriter is created by ToCSV, configured by chaining its
// configuration methods, and run by calling Done.
type CSVWriter struct {

	// The Dstream to be written.
	stream Dstream

	// Format for float type values.  When empty, getFmt falls back
	// to "%.8f".
	floatFmt string

	// A slice of format strings, stored per-variable and indexed by
	// column position.  It is nil until Formats is called; an empty
	// entry means that no format was given for that column.
	fmts []string

	// The destination of the csv output, set by Filename or SetWriter
	wtr io.Writer
}

// ToCSV starts the process of writing the Dstream d in CSV format.
// Use SetWriter or Filename on the result to choose the destination,
// optionally chain further customization methods, and then call Done
// to carry out the writing.
func ToCSV(d Dstream) *CSVWriter {
	return &CSVWriter{stream: d}
}

// FloatFmt sets the format string (for example "%.3f") to be used
// when writing float values.  This value is ignored for columns that
// were given their own format in a call to the Formats method.  If no
// float format is set, "%.8f" is used.  It returns the receiver so
// that configuration calls can be chained.
func (dw *CSVWriter) FloatFmt(format string) *CSVWriter {

	// The parameter is deliberately not named fmt, which would
	// shadow the fmt package inside this method.
	dw.floatFmt = format
	return dw
}

// Formats assigns per-column format strings for writing the Dstream.
// The argument maps variable names to the format to use for that
// variable; columns not mentioned keep whatever format they had.
// Formats panics if a name does not correspond to a variable of the
// Dstream.
func (dw *CSVWriter) Formats(fmts map[string]string) *CSVWriter {
	positions := VarPos(dw.stream)

	// Allocate the per-column table on first use.
	if dw.fmts == nil {
		dw.fmts = make([]string, dw.stream.NumVar())
	}

	for name, format := range fmts {
		j, found := positions[name]
		if !found {
			panic(fmt.Sprintf("ToCSV: column %s not found", name))
		}
		dw.fmts[j] = format
	}

	return dw
}

// Filename configures the CSVWriter to write to the given named file,
// which is created with os.Create (an existing file is truncated).
// It panics if the file cannot be created.  Note that Done flushes the
// csv output but does not close the file.
func (dw *CSVWriter) Filename(name string) *CSVWriter {

	f, err := os.Create(name)
	if err != nil {
		panic(err)
	}

	// Assign only on success.  Storing the nil *os.File of a failed
	// Create in the io.Writer field would make it a non-nil interface
	// and defeat the "writer must be set" check in Done.
	dw.wtr = f

	return dw
}

// SetWriter configures the CSVWriter to write to the given io stream,
// replacing any writer set earlier by SetWriter or Filename.  It
// returns the receiver so that configuration calls can be chained.
func (dw *CSVWriter) SetWriter(w io.Writer) *CSVWriter {

	dw.wtr = w
	return dw
}

// getFmt returns the format string to use for column col, whose
// values are of kind t ("float" or "int").  A non-empty format set for
// the column through Formats takes precedence; otherwise floats use
// the FloatFmt value (or "%.8f" if none was set) and ints use "%d".
// Any other kind causes a panic.
func (dw *CSVWriter) getFmt(t string, col int) string {

	// A column-specific format wins over the type-based default.
	if dw.fmts != nil {
		if f := dw.fmts[col]; f != "" {
			return f
		}
	}

	if t == "int" {
		return "%d"
	}
	if t != "float" {
		panic("unknown type")
	}

	if dw.floatFmt != "" {
		return dw.floatFmt
	}
	return "%.8f"
}

// Done completes writing a Dstream to the configured io.Writer in csv
// format.  The variable names are written as a header row, followed
// by one row per observation of every chunk of the stream.  Values of
// type float64 are formatted using getFmt, string values are written
// as they are, and values of any other type are written as empty
// fields.
//
// Done returns an error if no writer has been set, or if writing or
// flushing the csv output fails.
func (dw *CSVWriter) Done() error {

	if dw.wtr == nil {
		return errors.New("ToCSV: writer must be set before calling Done")
	}

	csw := csv.NewWriter(dw.wtr)

	if err := csw.Write(dw.stream.Names()); err != nil {
		return err
	}

	nvar := dw.stream.NumVar()
	rec := make([]string, nvar)
	fmts := make([]string, nvar)

	// The formats are looked up only while the first row is being
	// written and are reused for all subsequent rows.
	firstrow := true
	for dw.stream.Next() {
		// The length of the first column gives the chunk length.
		n := ilen(dw.stream.GetPos(0))

		for i := 0; i < n; i++ {
			for j := 0; j < nvar; j++ {
				// TODO: better support for types
				switch x := dw.stream.GetPos(j).(type) {
				case []float64:
					if firstrow {
						fmts[j] = dw.getFmt("float", j)
					}
					rec[j] = fmt.Sprintf(fmts[j], x[i])
				case []string:
					rec[j] = x[i]
				default:
					rec[j] = ""
				}
			}
			if err := csw.Write(rec); err != nil {
				return err
			}
			firstrow = false
		}
	}

	// The csv.Writer is buffered, so an error from the underlying
	// writer may only surface during the flush.  Flush does not
	// return it, so it has to be fetched with Error.
	csw.Flush()

	return csw.Error()
}