/*-
* Copyright (c) 2019 Christos Zoulas
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Parse CSV object serialization format (RFC-4180, RFC-7111)
*/
#ifndef TEST
#include "file.h"
#ifndef lint
FILE_RCSID("@(#)$File: is_csv.c,v 1.13 2023/07/17 16:08:17 christos Exp $")
#endif
#include <string.h>
#include "magic.h"
#else
#include <sys/types.h>
#endif
/*
 * Debug tracing helper.
 * With DEBUG defined, DPRINTF(fmt, ...) forwards to printf(); because
 * __VA_ARGS__ follows a comma, at least one argument after fmt is required.
 * Without DEBUG it expands to nothing, so its arguments are not evaluated.
 */
#ifdef DEBUG
#include <stdio.h>
#define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
#else
#define DPRINTF(fmt, ...)
#endif
/*
 * CSV_LINES bounds how many lines csv_parse() examines:
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up to the number of lines specified
 *
 * A trailing line that is not terminated by a newline is never counted:
 * lines are only counted when '\n' is seen (which also covers CRLF).
 */
#ifndef CSV_LINES
#define CSV_LINES 10
#endif
static int csv_parse(const unsigned char *, const unsigned char *);
/*
 * Skip the remainder of a quoted field.
 *
 * uc points just past the opening quote and ue is one past the end of the
 * buffer.  Returns a pointer to the first character following the closing
 * quote, or ue if the buffer ends first (including when the closing quote
 * is the very last byte).  A doubled quote ("") inside the field is an
 * escaped quote and does not terminate it.
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p;
	int pending = 0;	/* odd number of consecutive quotes seen */

	for (p = uc; p < ue; p++) {
		if (*p == '"') {
			/*
			 * The first quote of a run may close the field; a
			 * second one turns the pair into an escaped quote.
			 */
			pending = !pending;
		} else if (pending) {
			/* Previous quote was the closing one: stop here. */
			return p;
		}
	}
	return ue;
}
/*
 * Decide whether the bytes in [uc, ue) look like CSV.
 *
 * Commas outside quoted fields are counted per line; the count from the
 * first line becomes the reference and every later line must match it.
 * Returns 0 as soon as the first line has no commas or a line disagrees
 * with the reference.  When CSV_LINES is non-zero the verdict is given
 * once that many newline-terminated lines were seen.  Otherwise (or if
 * the buffer ends earlier) the result is 1 only if a reference count was
 * established and at least two lines were counted.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	size_t commas = 0;	/* separators seen on the current line */
	size_t expected = 0;	/* separators per line, from the first line */
	size_t lines = 0;	/* newline-terminated lines seen so far */

	while (uc < ue) {
		const unsigned char ch = *uc++;

		if (ch == '"') {
			// Skip over the quoted field
			uc = eatquote(uc, ue);
			continue;
		}
		if (ch == ',') {
			commas++;
			continue;
		}
		if (ch != '\n')
			continue;

		DPRINTF("%zu %zu %zu\n", lines, commas, expected);
		lines++;
#if CSV_LINES
		// Line budget reached: decide using this line's count
		if (lines == CSV_LINES)
			return expected != 0 && expected == commas;
#endif
		if (expected == 0) {
			// First line without any separator cannot be CSV
			if (commas == 0)
				return 0;
			// First line fixes the reference count
			expected = commas;
		} else if (expected != commas) {
			// Line disagrees with the reference count
			return 0;
		}
		commas = 0;
	}
	return expected != 0 && lines >= 2;
}
#ifndef TEST
/*
 * Report whether buffer b holds CSV text and, if so, print a description.
 *
 * Returns 0 when looks_text is false, when MAGIC_APPLE or MAGIC_EXTENSION
 * is set in ms->flags, or when csv_parse() rejects the buffer.
 * Returns 1 on a match: nothing is printed if the MIME flags equal
 * MAGIC_MIME_ENCODING, "text/csv" is printed if any other MIME flag is
 * set, otherwise "CSV <code> text" (code omitted when NULL).
 * Returns -1 if file_printf() fails.
 */
int
file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text,
    const char *code)
{
	const unsigned char *begin = CAST(const unsigned char *, b->fbuf);
	const unsigned char *end = begin + b->flen;
	const int mime_bits = ms->flags & MAGIC_MIME;
	const char *enc = code ? code : "";
	const char *sep = code ? " " : "";
	int rv;

	if (!looks_text ||
	    (ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
		return 0;

	if (!csv_parse(begin, end))
		return 0;

	/* Match found, but this flag combination prints nothing here. */
	if (mime_bits == MAGIC_MIME_ENCODING)
		return 1;

	if (mime_bits)
		rv = file_printf(ms, "text/csv");
	else
		rv = file_printf(ms, "CSV %s%stext", enc, sep);

	return rv == -1 ? -1 : 1;
}
#else
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
#include <err.h>
/*
 * Stand-alone test driver, built only when TEST is defined.
 *
 * Reads the file named by argv[1] into memory, runs csv_parse() on it and
 * prints "is csv 1" or "is csv 0".  Exits with EXIT_FAILURE on a usage
 * error or if the file cannot be opened, stat'ed, or read completely.
 */
int
main(int argc, char *argv[])
{
	int fd, rv;
	struct stat st;
	unsigned char *p;
	size_t len, off;
	ssize_t nr;

	/* argv[1] is dereferenced below; make sure it exists. */
	if (argc != 2)
		errx(EXIT_FAILURE, "Usage: %s <file>",
		    argc > 0 ? argv[0] : "is_csv");

	if ((fd = open(argv[1], O_RDONLY)) == -1)
		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);

	if (fstat(fd, &st) == -1)
		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);

	len = (size_t)st.st_size;

	/*
	 * No CAST() here: file.h is not included in TEST builds, and the
	 * void * returned by malloc() converts implicitly in C.  Request at
	 * least one byte because malloc(0) may return NULL for an empty file.
	 */
	if ((p = malloc(len != 0 ? len : 1)) == NULL)
		err(EXIT_FAILURE, "Can't allocate %jd bytes",
		    (intmax_t)st.st_size);

	/* read() may return fewer bytes than requested; loop until done. */
	for (off = 0; off < len; off += (size_t)nr) {
		nr = read(fd, p + off, len - off);
		if (nr == -1)
			err(EXIT_FAILURE, "Can't read %jd bytes",
			    (intmax_t)st.st_size);
		/* Unexpected EOF: errno is meaningless, so use errx(). */
		if (nr == 0)
			errx(EXIT_FAILURE, "Can't read %jd bytes",
			    (intmax_t)st.st_size);
	}

	rv = csv_parse(p, p + len);
	printf("is csv %d\n", rv);

	free(p);
	(void)close(fd);
	return 0;
}
#endif