File: io.py

package info (click to toggle)
python-pomegranate 0.15.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 36,948 kB
  • sloc: python: 11,489; makefile: 259; sh: 28
file content (454 lines) | stat: -rw-r--r-- 12,415 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
import numpy

try:
	import pandas
except:
	pandas = None

class BaseGenerator(object):
	"""The base data generator class.

	This object is inherited by data generator objects in order to specify that
	they are data generators. Do not use this object directly.

	Every member defined here is a placeholder that raises
	NotImplementedError; a subclass overrides the ones it supports.
	"""

	def __init__(self):
		pass

	def __len__(self):
		"""Placeholder for the number of examples.

		Raises
		------
		NotImplementedError
			Always.
		"""
		# Raise rather than return the exception class: returning it made
		# len() fail with an unrelated TypeError.
		raise NotImplementedError

	@property
	def shape(self):
		"""Placeholder for the shape of the data; always raises
		NotImplementedError."""
		raise NotImplementedError

	@property
	def classes(self):
		"""Placeholder for the unique labels; always raises
		NotImplementedError."""
		raise NotImplementedError

	@property
	def ndim(self):
		"""Placeholder for the dimensionality of the data; always raises
		NotImplementedError."""
		raise NotImplementedError

class DataGenerator(BaseGenerator):
	"""A generator that returns batches of a data set.

	This object will wrap a data set and optionally a set of labels and will
	return batches of data as requested. When it reaches the end of a data
	set it will not roll over but rather return a batch of data smaller
	than the other batches.

	Parameters
	----------
	X : numpy.ndarray or list
		The data set to iterate over. It is copied into a numpy array.

	weights : numpy.ndarray or list or None, optional
		The weights for each example. If None, every example gets a weight
		of 1. Default is None.

	y : numpy.ndarray or list or None, optional
		The set of labels for each example in the data set. It is stored as
		a numpy array. `labeled_batches` and `unlabeled_batches` treat the
		label -1 as "unlabeled". Default is None.

	batch_size : int or None, optional
		The size of the batches to return. If None will return the full data
		set each time. Must be at least 1 when given. Default is None

	batches_per_epoch : int or None, optional
		The maximum number of batches yielded by one call to `batches`. If
		the value is too low you may not see all examples from the data set.
		If None, will return enough batches to cover the entire data set.
		It is not consulted when a single batch covers the whole data set.
		Default is None.

	Raises
	------
	ValueError
		If `y` or `weights` does not have one entry per example, or if
		`batch_size` is smaller than 1.
	"""

	def __init__(self, X, weights=None, y=None, batch_size=None,
			batches_per_epoch=None):
		self.X = numpy.array(X)
		self.idx = 0
		n_examples = len(self.X)

		if y is not None and len(y) != n_examples:
			raise ValueError("Size of label vector y does not match size of data.")

		# Labels are kept as an array so that the `== -1` / `!= -1` masks
		# used below are element-wise; on a plain list the comparison is a
		# single bool and the masks select the wrong thing.
		self.y = None if y is None else numpy.array(y)

		if weights is None:
			self.weights = numpy.ones(n_examples, dtype='float64')
		else:
			if len(weights) != n_examples:
				raise ValueError("Size of weight vector does not match size of data.")

			self.weights = numpy.array(weights)

		if batch_size is None:
			self.batch_size = len(self)
		else:
			self.batch_size = int(batch_size)

			# A step of zero (or less) would never advance past the data
			# and the batch loops would not terminate.
			if self.batch_size < 1:
				raise ValueError("batch_size must be a positive integer.")

		if batches_per_epoch is None:
			self.batches_per_epoch = float("inf")
		else:
			self.batches_per_epoch = batches_per_epoch

	def __len__(self):
		"""Return the number of examples in the data set."""
		return len(self.X)

	@property
	def shape(self):
		"""The shape of the wrapped data array."""
		return self.X.shape

	@property
	def classes(self):
		"""The sorted unique labels (including -1 if present).

		Raises
		------
		ValueError
			If no labels were given.
		"""
		if self.y is None:
			raise ValueError("Classes cannot be found on an unlabeled data set.")

		return numpy.unique(self.y)

	@property
	def ndim(self):
		"""The number of dimensions of the wrapped data array."""
		return self.X.ndim

	def batches(self):
		"""Iterate over the data set in order, one batch at a time.

		Yields
		------
		tuple
			`(X, y, weights)` if labels were given, otherwise `(X, weights)`.
		"""
		if self.batch_size == len(self):
			# One batch covers everything: hand back the full arrays.
			if self.y is not None:
				yield self.X, self.y, self.weights
			else:
				yield self.X, self.weights
			return

		start, iteration = 0, 0
		while start < len(self) and iteration < self.batches_per_epoch:
			end = start + self.batch_size

			if self.y is not None:
				yield (self.X[start:end], self.y[start:end],
					self.weights[start:end])
			else:
				yield self.X[start:end], self.weights[start:end]

			start = end
			iteration += 1

	def labeled_batches(self):
		"""Iterate over the examples whose label is not -1.

		Yields
		------
		tuple
			`(X, y, weights)` for at most `batch_size` labeled examples.

		Raises
		------
		ValueError
			If no labels were given.
		"""
		if self.y is None:
			raise ValueError("Cannot return labeled batches from an "
				"unlabeled data set.")

		mask = self.y != -1
		X, y, weights = self.X[mask], self.y[mask], self.weights[mask]

		start = 0
		while start < len(X):
			end = start + self.batch_size
			yield X[start:end], y[start:end], weights[start:end]
			start = end

	def unlabeled_batches(self):
		"""Iterate over the examples whose label is -1.

		Nothing is yielded when no labels were given.

		Yields
		------
		tuple
			`(X, weights)` for at most `batch_size` unlabeled examples.
		"""
		if self.y is None:
			return

		mask = self.y == -1
		X, weights = self.X[mask], self.weights[mask]

		start = 0
		while start < len(X):
			end = start + self.batch_size
			yield X[start:end], weights[start:end]
			start = end

class SequenceGenerator(BaseGenerator):
	"""A generator that hands out the sequences of a data set one by one.

	The data set, and optionally weights and labels, are wrapped as given
	and every batch holds exactly one sequence; larger batches are not
	supported because of the processing in pomegranate.

	Parameters
	----------
	X : numpy.ndarray or list
		The sequences to iterate over.

	weights : numpy.ndarray or list or None, optional
		One weight per sequence. If None, a weight of 1 is used for every
		sequence. Default is None.

	y : numpy.ndarray or list or None, optional
		One label per sequence. An entry of None marks the sequence as
		unlabeled for `labeled_batches` / `unlabeled_batches`. Default is
		None.

	batches_per_epoch : int or None, optional
		Stored on the instance (as infinity when None). The iteration
		methods of this class do not read it. Default is None.
	"""

	def __init__(self, X, weights=None, y=None, batches_per_epoch=None):
		self.idx = 0
		self.X = X
		self.y = y

		self.weights = (numpy.ones(len(X), dtype='float64')
			if weights is None else weights)

		self.batches_per_epoch = (float("inf")
			if batches_per_epoch is None else batches_per_epoch)

	def __len__(self):
		"""Return the number of sequences."""
		return len(self.X)

	@property
	def shape(self):
		"""A `(number of sequences, features per observation)` pair.

		The feature count is read off the first sequence: 1 if it is
		one-dimensional, its second axis if it is two-dimensional.

		Raises
		------
		ValueError
			If the first sequence is neither one- nor two-dimensional.
		"""
		first = numpy.asarray(self.X[0])

		if first.ndim not in (1, 2):
			raise ValueError("Data must be passed in as a list of numpy arrays.")

		n_features = 1 if first.ndim == 1 else first.shape[1]
		return len(self.X), n_features

	@property
	def ndim(self):
		"""The length of the first sequence."""
		# NOTE(review): this is a length, not a count of array dimensions
		# as the name suggests -- confirm this is what callers expect.
		first = self.X[0]
		return len(first)

	@property
	def classes(self):
		"""The sorted unique labels.

		Raises
		------
		ValueError
			If no labels were given.
		"""
		if self.y is not None:
			return numpy.unique(self.y)

		raise ValueError("No labels found for this data set.")

	def batches(self):
		"""Iterate over every sequence in order.

		Yields
		------
		tuple
			`(X, weights, y)` if labels were given, otherwise `(X, weights)`.
			Each element is a length-one slice.
		"""
		for position in range(len(self)):
			window = slice(position, position + 1)
			batch = (self.X[window], self.weights[window])

			if self.y is not None:
				batch += (self.y[window],)

			yield batch

	def labeled_batches(self):
		"""Iterate over the sequences whose label is not None.

		Yields
		------
		tuple
			`(X, weights, y)`, each a one-element list.
		"""
		keep = [label is not None for label in self.y]

		X = [seq for seq, wanted in zip(self.X, keep) if wanted]
		weights = [w for w, wanted in zip(self.weights, keep) if wanted]
		y = [label for label in self.y if label is not None]

		for position in range(len(X)):
			window = slice(position, position + 1)
			yield X[window], weights[window], y[window]

	def unlabeled_batches(self):
		"""Iterate over the sequences whose label is None.

		Yields
		------
		tuple
			`(X, weights)`, each a one-element list.
		"""
		keep = [label is None for label in self.y]

		X = [seq for seq, wanted in zip(self.X, keep) if wanted]
		weights = [w for w, wanted in zip(self.weights, keep) if wanted]

		for position in range(len(X)):
			window = slice(position, position + 1)
			yield X[window], weights[window]

class DataFrameGenerator(BaseGenerator):
	"""A generator that returns batches of data from a DataFrame.

	This object will wrap a DataFrame and generate batches of data
	from it. This gives a natural support for pandas DataFrames to
	all pomegranate models. Weights and labels for examples can either
	be passed in separately (as Series, arrays or other array-likes)
	or specified as a column in the main DataFrame object.

	Parameters
	----------
	X : pandas.DataFrame
		The DataFrame containing data. The weights and labels (if used)
		can be columns in this DataFrame; such columns are removed from
		the data that is returned in batches.

	weights : pandas.Series, array-like, str, or None, optional
		The weights for each example. This can either be a series, a numpy
		array or list, a string key for the column of the DataFrame, or
		None. None means that all weights are uniform. Default is None.

	y : pandas.Series, array-like, str, or None, optional
		The labels for each example. This can either be a series, a numpy
		array or list, a string key for the column of the DataFrame, or
		None. None means that all examples are unlabeled. A label of -1
		marks a single example as unlabeled. Default is None.

	batch_size : int or None, optional
		The size of the batches to return. If None will return the full data
		set each time. Default is None

	batches_per_epoch : int or None, optional
		The maximum number of batches yielded by one call to `batches`. If
		the value is too low you may not see all examples from the data set.
		If None, will return enough batches to cover the entire data set.
		It is not consulted when a single batch covers the whole data set.
		Default is None.

	Raises
	------
	ValueError
		If pandas is not installed, if `weights` or `y` is of an unsupported
		kind, or if either does not have one entry per row.
	"""

	def __init__(self, X, weights=None, y=None, batch_size=None,
		batches_per_epoch=None):
		if pandas is None:
			raise ValueError("Must have pandas installed to use DataFrameGenerator.")

		self.X = X

		# Weights are resolved first so that a weight column is removed
		# from the data before the label column is looked up.
		weights = self._resolve(weights, "weights")
		if weights is None:
			weights = numpy.ones(self.X.shape[0])
		self.weights = weights

		self.y = self._resolve(y, "y")

		if len(self.weights) != len(self.X):
			raise ValueError("Size of weights does not match size of data set.")
		if self.y is not None and len(self.y) != len(self.X):
			raise ValueError("Size of labels does not match size of data set.")

		self.batch_size = batch_size or len(self.X)

		if batches_per_epoch is None:
			self.batches_per_epoch = float("inf")
		else:
			self.batches_per_epoch = batches_per_epoch

	def _resolve(self, values, name):
		"""Turn a weights/labels argument into a numpy array (or None).

		A string is taken as a column of `self.X`; that column is removed
		from `self.X`. Anything else that is not None is converted with
		`numpy.asarray`, so lists are honoured instead of silently ignored.
		"""
		if values is None:
			return None

		if isinstance(values, str):
			column = self.X[values].values
			self.X = self.X.drop(values, axis=1)
			return column

		if isinstance(values, pandas.Series):
			return values.values

		array = numpy.asarray(values)
		if array.ndim == 0:
			raise ValueError("%s must be a Series, an array-like, a column "
				"name, or None." % name)

		return array

	def __len__(self):
		"""Return the number of rows in the data."""
		return len(self.X)

	@property
	def shape(self):
		"""The shape of the data, excluding any weight/label columns."""
		return self.X.shape

	@property
	def classes(self):
		"""The sorted unique labels (including -1 if present).

		Raises
		------
		ValueError
			If no labels were given.
		"""
		if self.y is None:
			raise ValueError("Must specify y to return classes.")

		return numpy.unique(self.y)

	@property
	def ndim(self):
		"""The number of dimensions of the wrapped DataFrame."""
		return self.X.ndim

	def batches(self):
		"""Iterate over the rows in order, one batch at a time.

		Yields
		------
		tuple
			`(X, y, weights)` if labels were given, otherwise `(X, weights)`,
			with `X` as a numpy array.
		"""
		if self.batch_size == len(self):
			# One batch covers everything: hand back the full arrays.
			if self.y is not None:
				yield self.X.values, self.y, self.weights
			else:
				yield self.X.values, self.weights
			return

		start, iteration = 0, 0
		while start < len(self) and iteration < self.batches_per_epoch:
			end = start + self.batch_size
			x = self.X.iloc[start:end].values

			if self.y is not None:
				yield x, self.y[start:end], self.weights[start:end]
			else:
				yield x, self.weights[start:end]

			start = end
			iteration += 1

	def labeled_batches(self):
		"""Iterate over the rows whose label is not -1.

		Yields
		------
		tuple
			`(X, y, weights)` for at most `batch_size` labeled rows.

		Raises
		------
		ValueError
			If no labels were given.
		"""
		if self.y is None:
			raise ValueError("Must specify y to return labeled batches.")

		mask = self.y != -1
		X, y, weights = self.X.iloc[mask], self.y[mask], self.weights[mask]

		start = 0
		while start < len(X):
			end = start + self.batch_size
			yield X.iloc[start:end].values, y[start:end], weights[start:end]
			start = end

	def unlabeled_batches(self):
		"""Iterate over the rows whose label is -1.

		Yields
		------
		tuple
			`(X, weights)` for at most `batch_size` unlabeled rows.

		Raises
		------
		ValueError
			If no labels were given.
		"""
		if self.y is None:
			raise ValueError("Must specify y to return unlabeled batches.")

		mask = self.y == -1
		X, weights = self.X.iloc[mask], self.weights[mask]

		start = 0
		while start < len(X):
			end = start + self.batch_size
			yield X.iloc[start:end].values, weights[start:end]
			start = end

class CSVGenerator(BaseGenerator):
	"""A generator that returns batches of data from a data file.

	This object will wrap a file, such as a CSV file, and generate batches
	of data from it. The file is read in chunks of `batch_size` rows and is
	never loaded into memory as a whole. It is mostly a wrapper around a
	call to `pandas.read_csv`.

	The first pass over the data uses the reader opened by the constructor;
	every later pass reopens `filename` (rewinding it first if it is a
	seekable file object), so the generator can be iterated repeatedly.

	Parameters
	----------
	filename : str
		The name of the file to open.

	weight_column : str or int or None, optional
		The column to use for the weights. An int that is not itself a
		column label is taken as a column position. If None, assume uniform
		weights.

	y_column : str or int or None, optional
		The column to use for the labels, resolved like `weight_column`.
		If None, assume no labels.

	batch_size : int, optional
		The number of rows per batch. Default is 32.

	kwargs : keyword arguments, optional
		Any other argument to pass into `pandas.read_csv`.

	Raises
	------
	ValueError
		If pandas is not installed.
	"""

	def __init__(self, filename, weight_column=None, y_column=None,
		batch_size=32, **kwargs):
		if pandas is None:
			raise ValueError("Must have pandas installed to use CSVGenerator.")

		self.filename = filename
		self.weight_column = weight_column
		self.y_column = y_column
		self.batch_size = batch_size
		self.kwargs = kwargs

		# Opened eagerly so that an unreadable file fails here rather than
		# on first iteration.
		self.file = self._open_reader()
		self._fresh = True
		self._n_rows = None

	def _open_reader(self, rewind=False):
		"""Open a chunked pandas reader over `self.filename`."""
		if rewind and hasattr(self.filename, "seek"):
			self.filename.seek(0)

		return pandas.read_csv(self.filename, iterator=True,
			chunksize=self.batch_size, **self.kwargs)

	def _iter_chunks(self):
		"""Yield the file as DataFrame chunks, starting from the top.

		A reader can only be consumed once, so every pass after the first
		opens a new one. The reader is closed when the pass ends or the
		iteration is abandoned.
		"""
		if self._fresh:
			self._fresh = False
		else:
			self.file = self._open_reader(rewind=True)

		reader = self.file
		try:
			for chunk in reader:
				yield chunk
		finally:
			reader.close()

	def _column_label(self, chunk, column):
		"""Map a configured column (label or position) to a label of `chunk`."""
		if column is None:
			return None

		if isinstance(column, int) and column not in chunk.columns:
			return chunk.columns[column]

		return column

	def _iter_arrays(self):
		"""Yield `(X, weights, y)` float64 arrays per chunk; `y` may be None."""
		for chunk in self._iter_chunks():
			# Both labels are resolved against the untouched chunk so that
			# positional columns are not shifted by a previous drop.
			weight_key = self._column_label(chunk, self.weight_column)
			y_key = self._column_label(chunk, self.y_column)

			if weight_key is None:
				weights = numpy.ones(chunk.shape[0], dtype='float64')
			else:
				weights = chunk[weight_key].values.astype('float64')

			y = None
			if y_key is not None:
				y = chunk[y_key].values.astype('float64')

			used = [key for key in (weight_key, y_key) if key is not None]
			X = chunk.drop(used, axis=1).values.astype('float64')

			yield X, weights, y

	def __len__(self):
		"""Return the number of data rows in the file.

		The rows are counted by streaming over the file once; the count is
		cached afterwards.
		"""
		if self._n_rows is None:
			self._n_rows = sum(len(chunk) for chunk in self._iter_chunks())

		return self._n_rows

	@property
	def shape(self):
		"""Not available for a file.

		Raises
		------
		ValueError
			Always.
		"""
		raise ValueError("Cannot get shape of a file.")

	@property
	def classes(self):
		"""The sorted unique values of the label column.

		This streams over the whole file.

		Raises
		------
		ValueError
			If `y_column` was not given.
		"""
		if self.y_column is None:
			raise ValueError("Must specify y_column to return classes.")

		found = []
		for chunk in self._iter_chunks():
			y_key = self._column_label(chunk, self.y_column)
			found.append(numpy.unique(chunk[y_key]))

		# numpy.concatenate rejects an empty list.
		if not found:
			return numpy.array([])

		return numpy.unique(numpy.concatenate(found))

	def batches(self):
		"""Iterate over the file in chunks of `batch_size` rows.

		Yields
		------
		tuple
			`(X, weights, y)` if `y_column` was given, otherwise
			`(X, weights)`, all as float64 arrays. The weight and label
			columns are not part of `X`.
		"""
		for X, weights, y in self._iter_arrays():
			if y is None:
				yield X, weights
			else:
				yield X, weights, y

	def labeled_batches(self):
		"""Iterate over the rows whose label is not -1, chunk by chunk.

		Chunks without any labeled row are skipped, so batches can hold
		fewer than `batch_size` rows.

		Yields
		------
		tuple
			`(X, weights, y)` as float64 arrays.

		Raises
		------
		ValueError
			If `y_column` was not given.
		"""
		if self.y_column is None:
			raise ValueError("Must specify y_column to return labeled batches.")

		# NOTE(review): -1 is assumed to mark an unlabeled row, as in
		# DataGenerator and DataFrameGenerator -- confirm for CSV input.
		for X, weights, y in self._iter_arrays():
			mask = y != -1
			if mask.any():
				yield X[mask], weights[mask], y[mask]

	def unlabeled_batches(self):
		"""Iterate over the rows whose label is -1, chunk by chunk.

		Chunks without any unlabeled row are skipped, so batches can hold
		fewer than `batch_size` rows.

		Yields
		------
		tuple
			`(X, weights)` as float64 arrays.

		Raises
		------
		ValueError
			If `y_column` was not given.
		"""
		if self.y_column is None:
			raise ValueError("Must specify y_column to return unlabeled batches.")

		# NOTE(review): -1 is assumed to mark an unlabeled row, as in
		# DataGenerator and DataFrameGenerator -- confirm for CSV input.
		for X, weights, y in self._iter_arrays():
			mask = y == -1
			if mask.any():
				yield X[mask], weights[mask]