1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327
|
#ifndef READXL_XLSXCELL_
#define READXL_XLSXCELL_
#include <Rcpp.h>
#include "rapidxml.h"
#include "ColSpec.h"
#include "XlsxString.h"
#include "utils.h"
// Key reference for understanding the structure of the XML is
// ECMA-376 (http://www.ecma-international.org/publications/standards/Ecma-376.htm)
// Section and page numbers below refer to the 5th edition October 2016
// 18.3.1.4 c (Cell) [p1593]
// 18.3.1.96 v (Cell Value) [p1707]
// 18.18.11 ST_CellType (Cell Type) [p2451]
class XlsxCell {
rapidxml::xml_node<>* cell_;
std::pair<int,int> location_;
CellType type_;
public:
// if possible, provide guess at row and column based on position within xml
XlsxCell(rapidxml::xml_node<>* cell, int row = -1, int col = -1):
cell_(cell)
{
rapidxml::xml_attribute<>* ref = cell_->first_attribute("r");
if (ref == NULL) {
location_ = std::make_pair(row, col);
} else {
location_ = parseRef(ref->value());
}
type_ = CELL_UNKNOWN;
}
XlsxCell(std::pair<int,int> loc)
{
cell_ = NULL;
location_ = loc;
type_ = CELL_BLANK;
}
int row() const {
return location_.first;
}
int col() const {
return location_.second;
}
CellType type() const {
return type_;
}
void inferType(const StringSet& na,
const bool trimWs,
const std::vector<std::string>& stringTable,
const std::set<int>& dateFormats) {
// 1. Review of Excel's declared cell types, then
// 2. Summary of how Excel's cell types map to our CellType enum
//
// this table refers to the value of the t attribute of a cell
// 18.18.11 ST_CellType (Cell Type) [p2451]
// This simple type is restricted to the values listed in the following table:
// -------------------------------------------------------------------------
// Enumeration Value Description
// -------------------------------------------------------------------------
// b (Boolean) Cell containing a boolean.
// d (Date) Cell contains a date in the ISO 8601 format.
// e (Error) Cell containing an error.
// inlineStr (Inline String) Cell containing an (inline) rich string, i.e.,
// one not in the shared string table. If this
// cell type is used, then the cell value is in
// the is element rather than the v element in
// the cell (c element).
// n (Number) Cell containing a number.
// s (Shared String) Cell containing a shared string.
// str (String) Cell containing a formula string.
//
// We map Excel's cell types to the CellType enum based on declared type
// and contents.
//
// CELL_BLANK
// inlineStr cell and (string is na or string can't be found)
// cell has no v node and is not an inlineStr cell
// v->value() is na
// error cell
// shared string cell and string is na
//
// CELL_LOGICAL
// Boolean cell and its value (TRUE or FALSE) is not in na
//
// CELL_DATE
// numeric cell (t attr is "n" or does not exist) with a date format
//
// CELL_NUMERIC
// numeric cell (t attr is "n" or does not exist) with no format or a
// non-date format
//
// CELL_TEXT
// inlineStr cell and string is found and string is not na
// ISO 8601 date cell (t attr is "d") <- we're not sure this exists IRL
// shared string cell and string is not na
// formula string cell and string is not na
// anything that is not explicitly addressed elsewhere
if (type_ != CELL_UNKNOWN) {
return;
}
rapidxml::xml_attribute<>* t = cell_->first_attribute("t");
rapidxml::xml_node<>* v = cell_->first_node("v");
// inlineStr (Inline String) Cell containing an (inline) rich string
if (t != NULL && strncmp(t->value(), "inlineStr", 9) == 0) {
// must do this first, because inlineStr cells do not have a v node
// and the check just below would otherwise make them all CELL_BLANK
rapidxml::xml_node<>* is = cell_->first_node("is");
std::string inline_string;
if (parseString(is, &inline_string)) {
type_ = na.contains(inline_string, trimWs) ? CELL_BLANK : CELL_TEXT;
} else {
type_ = CELL_BLANK;
}
return;
}
// s (Shared String) Cell containing a shared string.
if (t != NULL && strncmp(t->value(), "s", 5) == 0) {
int id = atoi(v->value());
const std::string& string = stringTable.at(id);
type_ = na.contains(string, trimWs) ? CELL_BLANK : CELL_TEXT;
return;
}
if (v == NULL || na.contains(v->value(), trimWs)) {
type_ = CELL_BLANK;
return;
}
// from here on, no need for explicit NA check
// n (Number) Cell containing a number.
if (t == NULL || strncmp(t->value(), "n", 5) == 0) {
rapidxml::xml_attribute<>* s = cell_->first_attribute("s");
int format = (s == NULL) ? -1 : atoi(s->value());
type_ = (dateFormats.count(format) > 0) ? CELL_DATE : CELL_NUMERIC;
return;
}
// b (Boolean) Cell containing a boolean.
if (strncmp(t->value(), "b", 5) == 0) {
type_ = CELL_LOGICAL;
return;
}
// d (Date) Cell contains a date in the ISO 8601 format.
if (strncmp(t->value(), "d", 5) == 0) {
// Hadley:
// Does excel use this? Regardless, don't have cross-platform ISO8601
// parser (yet) so need to return as text
// Jenny:
// Not entirely sure what this is about. I've never seen one IRL.
type_ = CELL_TEXT;
return;
}
// e (Error) Cell containing an error.
if (strncmp(t->value(), "e", 5) == 0) {
type_ = CELL_BLANK;
return;
}
// str (String) Cell containing a formula string.
if (strncmp(t->value(), "str", 5) == 0) {
type_ = CELL_TEXT;
return;
}
Rcpp::warning("Unrecognized cell type at %s: '%s'",
cellPosition(row(), col()), t->value());
}
std::string asStdString(const std::vector<std::string>& stringTable,
const bool trimWs) const {
if (cell_ == NULL) {
return "";
}
rapidxml::xml_node<>* v = cell_->first_node("v");
rapidxml::xml_attribute<>* t = cell_->first_attribute("t");
switch(type_) {
case CELL_UNKNOWN:
case CELL_BLANK:
return "";
case CELL_LOGICAL:
return atoi(v->value()) ? "TRUE" : "FALSE";
case CELL_DATE:
// not ideal for a date but will have to do ... one day: asDateString()?
case CELL_NUMERIC:
return std::string(v->value());
case CELL_TEXT:
{
std::string out_string;
// inlineStr
rapidxml::xml_node<>* is = cell_->first_node("is");
if (is != NULL) {
if (parseString(is, &out_string)) {
return trimWs ? trim(out_string) : out_string;
} else {
return "NA";
}
}
// shared string
if (strncmp(t->value(), "s", 5) == 0) {
out_string = stringFromTable(v->value(), stringTable);
return trimWs ? trim(out_string) : out_string;
}
// formula string cell or
// the mythical ISO 8601 date cell
out_string = std::string(v->value());
return trimWs ? trim(out_string) : out_string;
}
default:
Rcpp::warning("Unrecognized cell type at %s", cellPosition(row(), col()));
return "";
}
}
Rcpp::RObject asCharSxp(const std::vector<std::string>& stringTable,
const bool trimWs) const {
std::string out_string = asStdString(stringTable, trimWs);
return out_string.empty() ? NA_STRING : Rf_mkCharCE(out_string.c_str(), CE_UTF8);
}
int asLogical() const {
switch(type_) {
case CELL_UNKNOWN:
case CELL_BLANK:
case CELL_DATE:
case CELL_TEXT:
return NA_LOGICAL;
case CELL_LOGICAL:
case CELL_NUMERIC:
{
rapidxml::xml_node<>* v = cell_->first_node("v");
return atoi(v->value()) != 0;
}
default:
Rcpp::warning("Unrecognized cell type at %s", cellPosition(row(), col()));
return NA_LOGICAL;
}
}
double asDouble() const {
switch(type_) {
case CELL_UNKNOWN:
case CELL_BLANK:
case CELL_TEXT:
return NA_REAL;
case CELL_LOGICAL:
case CELL_DATE:
case CELL_NUMERIC:
{
rapidxml::xml_node<>* v = cell_->first_node("v");
return atof(v->value());
}
default:
Rcpp::warning("Unrecognized cell type at %s", cellPosition(row(), col()));
return NA_REAL;
}
}
double asDate(bool is1904) const {
switch(type_) {
case CELL_UNKNOWN:
case CELL_BLANK:
case CELL_LOGICAL:
case CELL_TEXT:
return NA_REAL;
case CELL_DATE:
case CELL_NUMERIC:
{
rapidxml::xml_node<>* v = cell_->first_node("v");
return POSIXctFromSerial(atof(v->value()), is1904);
}
default:
Rcpp::warning("Unrecognized cell type at %s", cellPosition(row(), col()));
return NA_REAL;
}
}
private:
std::string stringFromTable(const char* val,
const std::vector<std::string>& stringTable) const {
int id = atoi(val);
if (id < 0 || id >= (int) stringTable.size()) {
Rcpp::warning("Invalid string id at %s: %i", cellPosition(row(), col()), id);
return "";
}
const std::string& string = stringTable.at(id);
return string;
}
};
#endif
|