File: build_corpus.sh

package info (click to toggle)
simdjson 4.3.1-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 31,400 kB
  • sloc: cpp: 195,760; ansic: 20,954; sh: 1,126; python: 885; makefile: 47; ruby: 25; javascript: 13
file content (18 lines) | stat: -rwxr-xr-x 477 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#!/bin/sh
#
# Builds a corpus from all small JSON files in the source directory.
# Each file is copied under a name equal to the SHA-1 of its content,
# suffixed with .json. The copies are zipped into a flat archive named corpus.zip

# Abort on the first failing command and on use of an unset variable.
set -eu

# Scratch directory that collects the renamed copies before zipping.
tmp=$(mktemp -d)

# Remove the scratch directory on every exit path, including a failure
# caught by `set -e`. The signal trap turns HUP/INT/TERM into a normal
# exit so that the EXIT trap also fires in shells that would otherwise
# skip it when killed by a signal.
trap 'rm -rf -- "$tmp"' EXIT
trap 'exit 1' HUP INT TERM

# Directory one level above the one containing this script.
root=$(readlink -f "$(dirname -- "$0")/..")

# Copy every regular *.json file selected by `-size -4k` into the scratch
# directory, named after the SHA-1 of its content. Passing the paths
# through `-exec ... {} +` hands them to the inner shell as arguments, so
# names containing spaces, backslashes or newlines are preserved intact.
find "$root" -type f -size -4k -name "*.json" -exec sh -c '
  set -eu
  dest=$1
  shift
  for json do
    # sha1sum prints "<hash>  -" for stdin; keep only the hash. Using
    # parameter expansion instead of a pipe lets a sha1sum failure
    # abort the run rather than yield an empty file name.
    sum=$(sha1sum < "$json")
    sum=${sum%% *}
    cp -- "$json" "$dest/$sum.json"
  done
' sh "$tmp" {} +

# --junk-paths stores the files without their directory components, so
# the archive is flat.
zip --quiet --junk-paths -r corpus.zip "$tmp"