1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
|
# ----------------------------------------------------------------------------
# Copyright (c) 2017-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import tarfile
import json
import qiime2.plugin.model as model
from qiime2.plugin import ValidationError
def _validate_record_len(cells, current_line_number, exp_len):
if len(cells) != exp_len:
raise ValidationError(
"Expected data record to be TSV with {0} "
"fields. Detected {1} fields at line {2}:\n\n{3!r}"
.format(exp_len, len(cells), current_line_number, cells))
def _validate_file_not_empty(has_data):
if not has_data:
raise ValidationError(
"There must be at least one data record present in the "
"file in addition to the header line.")
class BooleanSeriesFormat(model.TextFileFormat):
    """TSV format: one header line, then two-field ``True``/``False`` records.

    Every data record must split into exactly two tab-separated fields and
    the second field must be the literal text ``True`` or ``False``.
    """

    def _validate_(self, level):
        """Validate the file body, raising ``ValidationError`` on bad content.

        Parameters
        ----------
        level : str
            ``'min'`` stops after 5 data records have been checked;
            ``'max'`` checks every record. Any other value raises
            ``KeyError``.
        """
        record_limits = {'min': 5, 'max': None}
        max_records = record_limits[level]
        allowed_values = ('True', 'False')

        with self.open() as handle:
            # The header line is consumed but, for now, not validated.
            handle.readline()

            found_record = False
            for lineno, raw_line in enumerate(handle, start=2):
                fields = raw_line.strip().split('\t')
                _validate_record_len(fields, lineno, 2)

                value = str(fields[1])
                if value not in allowed_values:
                    raise ValidationError(
                        "Expected data to be comprised of values `True` and "
                        "`False`, found {0} at line {1}."
                        .format(value, lineno))

                found_record = True
                # Numbering starts at 2 (line 1 is the header), so
                # ``lineno - 1`` is the count of records checked so far.
                if max_records is not None and (lineno - 1) >= max_records:
                    break

            _validate_file_not_empty(found_record)
# Single-file directory format: 'outliers.tsv' checked as BooleanSeriesFormat.
BooleanSeriesDirectoryFormat = model.SingleFileDirectoryFormat(
    'BooleanSeriesDirectoryFormat',
    'outliers.tsv',
    BooleanSeriesFormat,
)
# This is effectively an internal format - it isn't registered with the
# plugin, but rather used as part of a dir fmt. This format also exists
# in q2-feature-classifier.
class PickleFormat(model.BinaryFileFormat):
    """Binary format whose file must be recognised as a tar archive."""

    def _validate_(self, level):
        """Raise ``ValidationError`` if the file is not a tar archive.

        ``level`` is not consulted; the same check runs at every level.
        """
        path = str(self)
        if tarfile.is_tarfile(path):
            return

        raise ValidationError(
            "Unable to load pickled file (not a tar file).")
# https://github.com/qiime2/q2-types/issues/49
# This is effectively an internal format - it isn't registered with the
# plugin, but rather used as part of a dir fmt. This format also exists
# in q2-feature-classifier.
class JSONFormat(model.TextFileFormat):
    """Text format whose entire contents must parse as JSON."""

    def _validate_(self, level):
        """Raise ``ValidationError`` if the file cannot be decoded as JSON.

        ``level`` is not consulted; the whole file is always parsed.
        """
        with self.open() as handle:
            text = handle.read()

        try:
            json.loads(text)
        except json.JSONDecodeError as decode_error:
            raise ValidationError(decode_error)
class SampleEstimatorDirFmt(model.DirectoryFormat):
    """Directory format made of a JSON version file and a tar archive."""

    # 'sklearn_version.json', validated with JSONFormat.
    version_info = model.File(
        'sklearn_version.json',
        format=JSONFormat,
    )
    # 'sklearn_pipeline.tar', validated with PickleFormat.
    sklearn_pipeline = model.File(
        'sklearn_pipeline.tar',
        format=PickleFormat,
    )
class PredictionsFormat(model.TextFileFormat):
    """TSV format: one header line followed by two-field data records."""

    def _validate(self, n_records=None):
        """Check that data records each contain exactly two fields.

        Parameters
        ----------
        n_records : int, optional
            Stop after this many data records have been checked. ``None``
            (the default) checks every record in the file.

        Raises
        ------
        ValidationError
            If a checked record does not have two tab-separated fields, or
            if the file has no data records after the header.
        """
        with self.open() as handle:
            # The header is consumed but deliberately not validated: the
            # predicted column's name should stay flexible. Methods in
            # q2-sample-classifier write "predicted-*", but user-defined
            # column names must be accepted as well.
            handle.readline()

            found_record = False
            for lineno, raw_line in enumerate(handle, start=2):
                # Strip each cell rather than the whole line; stripping the
                # line would drop empty cells, causing a TypeError.
                fields = [field.strip() for field in raw_line.split('\t')]
                _validate_record_len(fields, lineno, 2)
                found_record = True

                if n_records is None:
                    continue
                # Numbering starts at 2 (line 1 is the header), so
                # ``lineno - 1`` is the count of records checked so far.
                if (lineno - 1) >= n_records:
                    break

            _validate_file_not_empty(found_record)

    def _validate_(self, level):
        """Validate at ``level``: ``'min'`` checks 5 records, ``'max'`` all."""
        limits_by_level = {'min': 5, 'max': None}
        self._validate(limits_by_level[level])
# Single-file directory format: 'predictions.tsv' checked as PredictionsFormat.
PredictionsDirectoryFormat = model.SingleFileDirectoryFormat(
    'PredictionsDirectoryFormat',
    'predictions.tsv',
    PredictionsFormat,
)
class _MultiColumnNumericFormat(model.TextFileFormat):
    """TSV format: a header line, then a row name plus numeric columns.

    Every data record must have at least two tab-separated fields, and each
    field after the first must be accepted by ``float``.
    """

    def _validate(self, n_records=None):
        """Check record width and that all non-name fields are numeric.

        Parameters
        ----------
        n_records : int, optional
            Stop after this many data records have been checked. ``None``
            (the default) checks every record in the file.

        Raises
        ------
        ValidationError
            If a checked record has fewer than two fields, if any field
            after the first cannot be converted with ``float``, or if the
            file has no data records after the header.
        """
        with self.open() as fh:
            # validate header
            # for now we will not validate any information in the header,
            # since column names, count etc are frequently unique to
            # individual estimators. Let's keep this flexible.
            fh.readline()

            # validate body
            has_data = False
            for line_number, line in enumerate(fh, start=2):
                # we want to strip each cell, not the original line
                # otherwise empty cells are dropped, causing a TypeError
                cells = [c.strip() for c in line.split('\t')]
                if len(cells) < 2:
                    raise ValidationError(
                        "Expected data record to be TSV with two or more "
                        "fields. Detected {0} fields at line {1}:\n\n{2!r}"
                        .format(len(cells), line_number, cells))

                # all values (except row name) should be numbers
                for value in cells[1:]:
                    try:
                        float(value)
                    except ValueError:
                        # Report the cell that actually failed conversion,
                        # which is not necessarily the first value column.
                        raise ValidationError(
                            "Columns must contain only numeric values. "
                            "A non-numeric value ({0!r}) was detected at line "
                            "{1}.".format(value, line_number))

                has_data = True
                # Numbering starts at 2 (line 1 is the header), so
                # ``line_number - 1`` is the count of records checked so far.
                if n_records is not None and (line_number - 1) >= n_records:
                    break

            _validate_file_not_empty(has_data)

    def _validate_(self, level):
        """Validate at ``level``: ``'min'`` checks 5 records, ``'max'`` all."""
        record_count_map = {'min': 5, 'max': None}
        self._validate(record_count_map[level])
class ImportanceFormat(_MultiColumnNumericFormat):
    """Multi-column numeric TSV; adds nothing to its base class."""
# Single-file directory format: 'importance.tsv' checked as ImportanceFormat.
ImportanceDirectoryFormat = model.SingleFileDirectoryFormat(
    'ImportanceDirectoryFormat',
    'importance.tsv',
    ImportanceFormat,
)
class ProbabilitiesFormat(_MultiColumnNumericFormat):
    """Multi-column numeric TSV; adds nothing to its base class."""
# Single-file directory format: 'class_probabilities.tsv' checked as
# ProbabilitiesFormat.
ProbabilitiesDirectoryFormat = model.SingleFileDirectoryFormat(
    'ProbabilitiesDirectoryFormat',
    'class_probabilities.tsv',
    ProbabilitiesFormat,
)

# Single-file directory format: 'true_targets.tsv' checked with the same
# PredictionsFormat used for 'predictions.tsv'.
TrueTargetsDirectoryFormat = model.SingleFileDirectoryFormat(
    'TrueTargetsDirectoryFormat',
    'true_targets.tsv',
    PredictionsFormat,
)
|