File: CollectorGuess.cpp

package info (click to toggle)
r-cran-readr 2.1.6-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,688 kB
  • sloc: cpp: 4,020; ansic: 1,811; makefile: 2
file content (161 lines) | stat: -rw-r--r-- 3,657 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#include "cpp11/R.hpp"
#include "cpp11/list.hpp"
#include "cpp11/strings.hpp"

#include "DateTime.h"
#include "DateTimeParser.h"
#include "LocaleInfo.h"
#include "QiParsers.h"
#include "utils.h"

// Signature shared by the per-value predicates in this file (isLogical,
// isNumber, isInteger, ...): given one string and a locale pointer, report
// whether the string is acceptable for that predicate's type.
typedef bool (*canParseFun)(const std::string&, LocaleInfo* pLocale);

// Returns true when every element of `x` that is neither NA nor zero-length
// is accepted by the `canParse` predicate; stops at the first rejection.
// NA and empty elements are never handed to the predicate.
bool canParse(
    const cpp11::strings& x, const canParseFun& canParse, LocaleInfo* pLocale) {
  for (const auto& elt : x) {
    // NA is tested first so size() is only consulted for non-NA elements.
    const bool ignorable = (elt == NA_STRING) || (elt.size() == 0);
    if (!ignorable && !canParse(std::string(elt), pLocale)) {
      return false;
    }
  }
  return true;
}

// Returns true when `x` holds nothing but NA and zero-length strings
// (vacuously true for a vector with no elements).
bool allMissing(const cpp11::strings& x) {
  for (const auto& elt : x) {
    if (elt == NA_STRING) {
      continue;
    }
    if (elt.size() == 0) {
      continue;
    }
    // Found an element that is neither NA nor empty.
    return false;
  }
  return true;
}

bool isLogical(const std::string& x, LocaleInfo* /*unused*/) {
  const char* const str = x.data();
  bool res = isLogical(str, str + x.size());
  return res;
}

// Returns true when parseNumber succeeds on `x` using the locale's decimal
// and grouping marks AND leaves both iterators at the ends of the string.
// Strings with a leading zero are rejected unless the zero is immediately
// followed by the decimal mark.
bool isNumber(const std::string& x, LocaleInfo* pLocale) {
  // Leading zero not followed by decimal mark
  const bool startsWithZero = x[0] == '0' && x.size() > 1;
  if (startsWithZero && x[1] != pLocale->decimalMark_) {
    return false;
  }

  std::string::const_iterator first = x.begin();
  std::string::const_iterator last = x.end();
  double value = 0;

  if (!parseNumber(
          pLocale->decimalMark_, pLocale->groupingMark_, first, last, value)) {
    return false;
  }

  // NOTE(review): parseNumber presumably narrows [first, last) to the numeric
  // part it matched; requiring both to be unchanged means the whole string
  // had to be the number -- confirm against QiParsers.h.
  return first == x.begin() && last == x.end();
}

bool isInteger(const std::string& x, LocaleInfo* /*unused*/) {
  // Leading zero
  if (x[0] == '0' && x.size() > 1) {
    return false;
  }

  double res = 0;
  std::string::const_iterator begin = x.begin();

  std::string::const_iterator end = x.end();

  return parseInt(begin, end, res) && begin == end;
}

// Returns true when parseDouble succeeds on `x` with the locale's decimal
// mark and the end pointer still sits x.size() characters past the start
// pointer afterwards. Strings with a leading zero are rejected unless the
// zero is immediately followed by the decimal mark.
bool isDouble(const std::string& x, LocaleInfo* pLocale) {
  // Leading zero not followed by decimal mark
  const bool startsWithZero = x[0] == '0' && x.size() > 1;
  if (startsWithZero && x[1] != pLocale->decimalMark_) {
    return false;
  }

  // Both pointers stay mutable lvalues because parseDouble receives them as
  // arguments and may update them.
  const char* first = x.c_str();
  const char* last = first + x.size();
  double value = 0;

  const bool parsed = parseDouble(pLocale->decimalMark_, first, last, value);
  return parsed && last == first + x.size();
}

bool isTime(const std::string& x, LocaleInfo* pLocale) {
  DateTimeParser parser(pLocale);

  parser.setDate(x.c_str());
  return parser.parseLocaleTime();
}

bool isDate(const std::string& x, LocaleInfo* pLocale) {
  DateTimeParser parser(pLocale);

  parser.setDate(x.c_str());
  return parser.parseLocaleDate();
}

static bool isDateTime(const std::string& x, LocaleInfo* pLocale) {
  DateTimeParser parser(pLocale);

  parser.setDate(x.c_str());
  bool ok = parser.parseISO8601();

  if (!ok) {
    return false;
  }

  if (!parser.compactDate()) {
    return true;
  }

  // Values like 00014567 are unlikely to be dates, so don't guess
  return parser.year() > 999;
}

// Guesses a collector type name for a character vector.
//
// input        - strings to inspect.
// locale_      - list used to construct the LocaleInfo passed to each
//                predicate.
// guessInteger - when false, the "integer" candidate is skipped.
//
// Returns "character" for a zero-length input, "logical" when every element
// is NA or empty, otherwise the name of the first candidate (in the order
// listed below) whose predicate accepts every non-missing element, falling
// back to "character".
[[cpp11::register]] std::string collectorGuess(
    const cpp11::strings& input,
    const cpp11::list& locale_,
    bool guessInteger) {
  LocaleInfo locale(static_cast<SEXP>(locale_));

  if (input.size() == 0) {
    return "character";
  }

  if (allMissing(input)) {
    return "logical";
  }

  // Candidate types, tried in order from strictest to most flexible.
  struct Candidate {
    const char* type;
    canParseFun accepts;
    bool enabled;
  };
  const Candidate candidates[] = {
      {"logical", isLogical, true},
      {"integer", isInteger, guessInteger},
      {"double", isDouble, true},
      {"number", isNumber, true},
      {"time", isTime, true},
      {"date", isDate, true},
      {"datetime", isDateTime, true},
  };

  for (const Candidate& candidate : candidates) {
    // A disabled candidate is skipped without scanning the input.
    if (candidate.enabled && canParse(input, candidate.accepts, &locale)) {
      return candidate.type;
    }
  }

  // Otherwise can always parse as a character
  return "character";
}