File: XlsxCell.h

package info (click to toggle)
r-cran-readxl 1.3.0-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 4,488 kB
  • sloc: ansic: 4,565; cpp: 3,401; makefile: 2
file content (327 lines) | stat: -rw-r--r-- 9,729 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
#ifndef READXL_XLSXCELL_
#define READXL_XLSXCELL_

#include <cstring>

#include <Rcpp.h>
#include "rapidxml.h"
#include "ColSpec.h"
#include "XlsxString.h"
#include "utils.h"

// Key reference for understanding the structure of the XML is
// ECMA-376 (http://www.ecma-international.org/publications/standards/Ecma-376.htm)
// Section and page numbers below refer to the 5th edition October 2016
// 18.3.1.4   c           (Cell)       [p1593]
// 18.3.1.96  v           (Cell Value) [p1707]
// 18.18.11   ST_CellType (Cell Type)  [p2451]

class XlsxCell {
  rapidxml::xml_node<>* cell_;
  std::pair<int,int> location_;
  CellType type_;

public:

  // if possible, provide guess at row and column based on position within xml
  XlsxCell(rapidxml::xml_node<>* cell, int row = -1, int col = -1):
  cell_(cell)
  {
    rapidxml::xml_attribute<>* ref = cell_->first_attribute("r");
    if (ref == NULL) {
      location_ = std::make_pair(row, col);
    } else {
      location_ = parseRef(ref->value());
    }
    type_ = CELL_UNKNOWN;
  }

  XlsxCell(std::pair<int,int> loc)
  {
    cell_ = NULL;
    location_ = loc;
    type_ = CELL_BLANK;
  }

  int row() const {
    return location_.first;
  }

  int col() const {
    return location_.second;
  }

  CellType type() const {
    return type_;
  }

  void inferType(const StringSet& na,
                 const bool trimWs,
                 const std::vector<std::string>& stringTable,
                 const std::set<int>& dateFormats) {
    // 1. Review of Excel's declared cell types, then
    // 2. Summary of how Excel's cell types map to our CellType enum
    //
    // this table refers to the value of the t attribute of a cell
    // 18.18.11   ST_CellType (Cell Type)  [p2451]
    // This simple type is restricted to the values listed in the following table:
    // -------------------------------------------------------------------------
    // Enumeration Value          Description
    // -------------------------------------------------------------------------
    // b (Boolean)                Cell containing a boolean.
    // d (Date)                   Cell contains a date in the ISO 8601 format.
    // e (Error)                  Cell containing an error.
    // inlineStr (Inline String)  Cell containing an (inline) rich string, i.e.,
    //                            one not in the shared string table. If this
    //                            cell type is used, then the cell value is in
    //                            the is element rather than the v element in
    //                            the cell (c element).
    // n (Number)                 Cell containing a number.
    // s (Shared String)          Cell containing a shared string.
    // str (String)               Cell containing a formula string.
    //
    // We map Excel's cell types to the CellType enum based on declared type
    // and contents.
    //
    // CELL_BLANK
    //   inlineStr cell and (string is na or string can't be found)
    //   cell has no v node and is not an inlineStr cell
    //   v->value() is na
    //   error cell
    //   shared string cell and string is na
    //
    // CELL_LOGICAL
    //   Boolean cell and its value (TRUE or FALSE) is not in na
    //
    // CELL_DATE
    //   numeric cell (t attr is "n" or does not exist) with a date format
    //
    // CELL_NUMERIC
    //   numeric cell (t attr is "n" or does not exist) with no format or a
    //   non-date format
    //
    // CELL_TEXT
    //   inlineStr cell and string is found and string is not na
    //   ISO 8601 date cell (t attr is "d") <- we're not sure this exists IRL
    //   shared string cell and string is not na
    //   formula string cell and string is not na
    //   anything that is not explicitly addressed elsewhere

    if (type_ != CELL_UNKNOWN) {
      return;
    }

    rapidxml::xml_attribute<>* t = cell_->first_attribute("t");
    rapidxml::xml_node<>* v = cell_->first_node("v");

    // inlineStr (Inline String)  Cell containing an (inline) rich string
    if (t != NULL && strncmp(t->value(), "inlineStr", 9) == 0) {
      // must do this first, because inlineStr cells do not have a v node
      // and the check just below would otherwise make them all CELL_BLANK
      rapidxml::xml_node<>* is = cell_->first_node("is");
      std::string inline_string;
      if (parseString(is, &inline_string)) {
        type_ = na.contains(inline_string, trimWs) ? CELL_BLANK : CELL_TEXT;
      } else {
        type_ = CELL_BLANK;
      }
      return;
    }

    // s (Shared String)          Cell containing a shared string.
    if (t != NULL && strncmp(t->value(), "s", 5) == 0) {
      int id = atoi(v->value());
      const std::string& string = stringTable.at(id);
      type_ = na.contains(string, trimWs) ? CELL_BLANK : CELL_TEXT;
      return;
    }

    if (v == NULL || na.contains(v->value(), trimWs)) {
      type_ = CELL_BLANK;
      return;
    }

    // from here on, no need for explicit NA check

    // n (Number)                 Cell containing a number.
    if (t == NULL || strncmp(t->value(), "n", 5) == 0) {
      rapidxml::xml_attribute<>* s = cell_->first_attribute("s");
      int format = (s == NULL) ? -1 : atoi(s->value());
      type_ = (dateFormats.count(format) > 0) ? CELL_DATE : CELL_NUMERIC;
      return;
    }

    // b (Boolean)                Cell containing a boolean.
    if (strncmp(t->value(), "b", 5) == 0) {
      type_ = CELL_LOGICAL;
      return;
    }

    // d (Date)                   Cell contains a date in the ISO 8601 format.
    if (strncmp(t->value(), "d", 5) == 0) {
      // Hadley:
      // Does excel use this? Regardless, don't have cross-platform ISO8601
      // parser (yet) so need to return as text
      // Jenny:
      // Not entirely sure what this is about. I've never seen one IRL.
      type_ = CELL_TEXT;
      return;
    }

    // e (Error)                  Cell containing an error.
    if (strncmp(t->value(), "e", 5) == 0) {
      type_ = CELL_BLANK;
      return;
    }

    // str (String)               Cell containing a formula string.
    if (strncmp(t->value(), "str", 5) == 0) {
      type_ = CELL_TEXT;
      return;
    }

    Rcpp::warning("Unrecognized cell type at %s: '%s'",
                  cellPosition(row(), col()), t->value());
  }

  std::string asStdString(const std::vector<std::string>& stringTable,
                          const bool trimWs) const {
    if (cell_ == NULL) {
      return "";
    }

    rapidxml::xml_node<>* v = cell_->first_node("v");
    rapidxml::xml_attribute<>* t = cell_->first_attribute("t");

    switch(type_) {

    case CELL_UNKNOWN:
    case CELL_BLANK:
      return "";

    case CELL_LOGICAL:
      return atoi(v->value()) ? "TRUE" : "FALSE";

    case CELL_DATE:
      // not ideal for a date but will have to do ... one day: asDateString()?
    case CELL_NUMERIC:
      return std::string(v->value());

    case CELL_TEXT:
    {
      std::string out_string;

      // inlineStr
      rapidxml::xml_node<>* is = cell_->first_node("is");
      if (is != NULL) {
        if (parseString(is, &out_string)) {
          return trimWs ? trim(out_string) : out_string;
        } else {
          return "NA";
        }
      }

      // shared string
      if (strncmp(t->value(), "s", 5) == 0) {
        out_string = stringFromTable(v->value(), stringTable);
        return trimWs ? trim(out_string) : out_string;
      }

      //   formula string cell or
      //   the mythical ISO 8601 date cell
      out_string = std::string(v->value());
      return trimWs ? trim(out_string) : out_string;
    }

    default:
      Rcpp::warning("Unrecognized cell type at %s", cellPosition(row(), col()));
      return "";
  }
  }

  Rcpp::RObject asCharSxp(const std::vector<std::string>& stringTable,
                          const bool trimWs) const {
    std::string out_string = asStdString(stringTable, trimWs);
    return out_string.empty() ? NA_STRING : Rf_mkCharCE(out_string.c_str(), CE_UTF8);
  }

  int asLogical() const {
    switch(type_) {

    case CELL_UNKNOWN:
    case CELL_BLANK:
    case CELL_DATE:
    case CELL_TEXT:
      return NA_LOGICAL;

    case CELL_LOGICAL:
    case CELL_NUMERIC:
    {
      rapidxml::xml_node<>* v = cell_->first_node("v");
      return atoi(v->value()) != 0;
    }

    default:
      Rcpp::warning("Unrecognized cell type at %s", cellPosition(row(), col()));
      return NA_LOGICAL;
    }
  }

  double asDouble() const {
    switch(type_) {

    case CELL_UNKNOWN:
    case CELL_BLANK:
    case CELL_TEXT:
      return NA_REAL;

    case CELL_LOGICAL:
    case CELL_DATE:
    case CELL_NUMERIC:
    {
      rapidxml::xml_node<>* v = cell_->first_node("v");
      return atof(v->value());
    }

    default:
      Rcpp::warning("Unrecognized cell type at %s", cellPosition(row(), col()));
      return NA_REAL;
    }
  }

  double asDate(bool is1904) const {
    switch(type_) {

    case CELL_UNKNOWN:
    case CELL_BLANK:
    case CELL_LOGICAL:
    case CELL_TEXT:
      return NA_REAL;

    case CELL_DATE:
    case CELL_NUMERIC:
    {
      rapidxml::xml_node<>* v = cell_->first_node("v");
      return POSIXctFromSerial(atof(v->value()), is1904);
    }

    default:
      Rcpp::warning("Unrecognized cell type at %s", cellPosition(row(), col()));
      return NA_REAL;
    }
  }

private:

  std::string stringFromTable(const char* val,
                              const std::vector<std::string>& stringTable) const {
    int id = atoi(val);
    if (id < 0 || id >= (int) stringTable.size()) {
      Rcpp::warning("Invalid string id at %s: %i", cellPosition(row(), col()), id);
      return "";
    }
    const std::string& string = stringTable.at(id);
    return string;
  }

};

#endif