#!/usr/bin/env bash
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
DATADIR=${DATADIR:-data}
report_error() {
  echo "Error on line $1 of $0"
}
myshuf() {
  perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
}
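# myshuf prints its input lines in random order; it is a portable stand-in
# for GNU `shuf` (e.g. on macOS) and needs only perl's List::Util.
# Illustrative usage:
#   printf 'a\nb\nc\n' | myshuf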
normalize_text() {
  tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \
    sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
        -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
        -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
}
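# normalize_text lowercases its input, prefixes each line with the fastText
# __label__ marker, strips quotes, pads punctuation with spaces and shuffles
# the lines (myshuf only reorders whole lines). Roughly, on one dbpedia-style
# CSV line (illustrative input, not taken from the real data):
#   echo '1,"Some Title","A short description."' | normalize_text
#   # -> __label__1 , some title , a short description .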
set -e
trap 'report_error $LINENO' ERR
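# From here on `set -e` aborts the script on the first unguarded failure and
# the ERR trap above reports the failing line; download/convert steps below
# use `|| rm -f "$data_result"` so a partial output file is removed rather
# than mistaken for a finished one on the next run.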
mkdir -p "${DATADIR}"
# Unsupervised datasets
data_result="${DATADIR}/rw_queries.txt"
if [ ! -f "$data_result" ]
then
cut -f 1,2 "${DATADIR}"/rw/rw.txt | awk '{print tolower($0)}' | tr '\t' '\n' > "$data_result" || rm -f "$data_result"
fi
data_result="${DATADIR}/enwik9.zip"
if [ ! -f "$data_result" ] || \
[ $(md5sum "$data_result" | cut -f 1 -d ' ') != "3e773f8a1577fda2e27f871ca17f31fd" ]
then
wget -c http://mattmahoney.net/dc/enwik9.zip -P "${DATADIR}" || rm -f "$data_result"
unzip "$data_result" -d "${DATADIR}" || rm -f "$data_result"
fi
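# The next step turns enwik9 into fil9: plain, lowercased Wikipedia text.
# wikifil.pl (Matt Mahoney's Wikipedia filter, also shipped alongside this
# script in the fastText repo) is assumed to be in the current working
# directory; adjust the path below if it lives elsewhere.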
data_result="${DATADIR}/fil9"
if [ ! -f "$data_result" ]
then
perl wikifil.pl "${DATADIR}/enwik9" > "$data_result" || rm -f "$data_result"
fi
data_result="${DATADIR}/rw/rw.txt"
if [ ! -f "$data_result" ]
then
wget -c https://nlp.stanford.edu/~lmthang/morphoNLM/rw.zip -P "${DATADIR}"
unzip "${DATADIR}/rw.zip" -d "${DATADIR}" || rm -f "$data_result"
fi
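# rw_queries.txt ends up with one lowercased word per line: the two words of
# each Rare Words pair from rw.txt, split onto separate lines by `tr '\t' '\n'`.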
# Supervised datasets
# Each dataset comes with a .train and a .test split to measure performance
echo "Downloading dataset dbpedia"
data_result="${DATADIR}/dbpedia_csv.tar.gz"
if [ ! -f "$data_result" ] || \
[ $(md5sum "$data_result" | cut -f 1 -d ' ') != "8139d58cf075c7f70d085358e73af9b3" ]
then
wget -c "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz" -O "$data_result"
tar -xzvf "$data_result" -C "${DATADIR}"
fi
data_result="${DATADIR}/dbpedia.train"
if [ ! -f "$data_result" ]
then
cat "${DATADIR}/dbpedia_csv/train.csv" | normalize_text > "$data_result" || rm -f "$data_result"
fi
data_result="${DATADIR}/dbpedia.test"
if [ ! -f "$data_result" ]
then
cat "${DATADIR}/dbpedia_csv/test.csv" | normalize_text > "$data_result" || rm -f "$data_result"
fi
echo "Downloading dataset tatoeba for langid"
data_result="${DATADIR}"/langid/all.txt
if [ ! -f "$data_result" ]
then
mkdir -p "${DATADIR}"/langid
wget http://downloads.tatoeba.org/exports/sentences.tar.bz2 -O "${DATADIR}"/langid/sentences.tar.bz2
tar xvfj "${DATADIR}"/langid/sentences.tar.bz2 --directory "${DATADIR}"/langid || exit 1
awk -F"\t" '{print"__label__"$2" "$3}' < "${DATADIR}"/langid/sentences.csv | shuf > "$data_result"
fi
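# Each line of all.txt pairs a Tatoeba language code with a sentence, e.g.
# (made-up sentence, real format):
#   __label__eng This is an example sentence.
# In the Tatoeba export, column 2 of sentences.csv is the language code and
# column 3 the sentence text, which is what the awk command above relies on.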
data_result="${DATADIR}/langid.train"
if [ ! -f "$data_result" ]
then
tail -n +10001 "${DATADIR}"/langid/all.txt > "$data_result"
fi
data_result="${DATADIR}/langid.valid"
if [ ! -f "$data_result" ]
then
head -n 10000 "${DATADIR}"/langid/all.txt > "$data_result"
fi
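# Since all.txt was shuffled when it was built, taking the first 10,000 lines
# as langid.valid and the rest as langid.train gives a random held-out split.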
echo "Downloading cooking dataset"
data_result="${DATADIR}"/cooking/cooking.stackexchange.txt
if [ ! -f "$data_result" ]
then
mkdir -p "${DATADIR}"/cooking/
wget https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz -O "${DATADIR}"/cooking/cooking.stackexchange.tar.gz
tar xvzf "${DATADIR}"/cooking/cooking.stackexchange.tar.gz --directory "${DATADIR}"/cooking || exit 1
cat "${DATADIR}"/cooking/cooking.stackexchange.txt | sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" > "${DATADIR}"/cooking/cooking.preprocessed.txt
fi
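# The preprocessed file keeps the original __label__ tags and only lowercases
# the text and pads punctuation with spaces, e.g. (illustrative line):
#   __label__baking __label__equipment how do i preheat a pizza stone ?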
data_result="${DATADIR}"/cooking.train
if [ ! -f "$data_result" ]
then
head -n 12404 "${DATADIR}"/cooking/cooking.preprocessed.txt > "${DATADIR}"/cooking.train
fi
data_result="${DATADIR}"/cooking.valid
if [ ! -f "$data_result" ]
then
tail -n 3000 "${DATADIR}"/cooking/cooking.preprocessed.txt > "${DATADIR}"/cooking.valid
fi
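# The first 12,404 preprocessed lines become cooking.train and the last 3,000
# cooking.valid; with the 15,404-line file fastText distributes, the two
# splits cover the whole file without overlapping.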
echo "Checking for YFCC100M"
data_result="${DATADIR}"/YFCC100M/train
if [ ! -f "$data_result" ]
then
echo 'Download YFCC100M, unpack it and place train into the following path: '"$data_result"
echo 'You can download YFCC100M at :'"https://fasttext.cc/docs/en/dataset.html"
echo 'After you download this, run the script again'
exit 1
fi
data_result="${DATADIR}"/YFCC100M/test
if [ ! -f "$data_result" ]
then
echo 'Download YFCC100M, unpack it and place test into the following path: '"$data_result"
echo 'You can download YFCC100M at :'"https://fasttext.cc/docs/en/dataset.html"
echo 'After you download this, run the script again'
exit 1
fi
DATASET=(
  ag_news
  sogou_news
  dbpedia
  yelp_review_polarity
  yelp_review_full
  yahoo_answers
  amazon_review_full
  amazon_review_polarity
)
ID=(
  0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news
  0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news
  0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia
  0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity
  0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full
  0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
  0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full
  0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity
)
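# ID[i] is the Google Drive file id for DATASET[i]; each archive is fetched
# from a URL of the form (illustrative):
#   https://drive.google.com/uc?export=download&id=<ID>
# and unpacks into ${DATADIR}/<dataset>_csv/{train,test}.csv.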
# Small datasets first
for i in {0..0}
do
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
  fi
done
# Large datasets require a bit more work due to the extra request page
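# Google Drive inserts an extra confirmation page for files it cannot scan,
# so the first curl below saves the session cookies and that page's HTML,
# and the second curl follows the confirm link scraped out of the HTML.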
for i in {1..7}
do
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${ID[i]}" > /tmp/intermezzo.html
    curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/\&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz"
    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
  fi
done