File: data.sh

package info (click to toggle)
fasttext 0.9.2%2Bds-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 4,952 kB
  • sloc: cpp: 5,459; python: 2,427; javascript: 635; sh: 621; makefile: 106; xml: 81; perl: 43
file content (69 lines) | stat: -rwxr-xr-x 2,456 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env bash
#
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Stop at the first command that fails.
set -e

# Directory, relative to where the script is invoked, that receives every
# downloaded archive and every generated file.
DATADIR=data/

# Create the directory only when it is missing, then make it the working
# directory: all later commands use paths relative to it.
[ -d "$DATADIR" ] || mkdir "$DATADIR"
cd "$DATADIR"
# --- WN18 -------------------------------------------------------------------
# Downloads and unpacks the WN18 archive, rewrites every wordnet-ml*.txt file
# into a "__label__"-prefixed copy named ft_<original name>, then builds two
# aggregates inside ${DIR}:
#   ft_wordnet-mlj12-full.txt         concatenation of every ft_* file
#   ft_wordnet-mlj12-valid+train.txt  ft_*train.txt followed by ft_*valid.txt
echo "preparing WN18"
# Original upstream location, kept for reference:
#wget -P . https://everest.hds.utc.fr/lib/exe/fetch.php?media=en:wordnet-mlj12.tar.gz
#mv fetch.php\?media\=en\:wordnet-mlj12.tar.gz wordnet-mlj12.tar.gz
# -O pins the output name, so a re-run overwrites the archive instead of
# saving a numbered copy (".1") while tar keeps extracting the stale one.
wget -O wordnet-mlj12.tar.gz \
  "https://github.com/mana-ysh/knowledge-graph-embeddings/raw/master/dat/wordnet-mlj12.tar.gz"
tar -xzvf wordnet-mlj12.tar.gz
DIR=wordnet-mlj12
# Remove aggregates left by an earlier run: their names match the ft_* globs
# used below, so they would otherwise be fed back into themselves.
rm -f -- "${DIR}/ft_wordnet-mlj12-full.txt" \
  "${DIR}/ft_wordnet-mlj12-valid+train.txt"
for f in "${DIR}"/wordnet-ml*.txt; do
  fn="${DIR}/ft_$(basename "$f")"
  # Each input line yields two output lines: one where field 1 carries the
  # "__label__" prefix (field 2 tagged "0_"), and one where field 3 carries
  # it (field 2 tagged "1_").
  awk '{print "__label__"$1,"0_"$2, $3;print $1,"1_"$2," __label__"$3}' < "${f}" > "${fn}"
done
cat "${DIR}"/ft_* > "${DIR}/ft_wordnet-mlj12-full.txt"
cat "${DIR}"/ft_*train.txt "${DIR}"/ft_*valid.txt > "${DIR}/ft_wordnet-mlj12-valid+train.txt"

# --- FB15K ------------------------------------------------------------------
# Downloads and unpacks the FB15K archive, rewrites every freebase*.txt file
# into a "__label__"-prefixed copy named ft_<original name>, then builds two
# aggregates inside ${DIR}:
#   ft_freebase_mtr100_mte100-full.txt         concatenation of every ft_* file
#   ft_freebase_mtr100_mte100-valid+train.txt  ft_*train.txt then ft_*valid.txt
echo "preparing FB15K"
# Original upstream location, kept for reference:
#wget https://everest.hds.utc.fr/lib/exe/fetch.php?media=en:fb15k.tgz
#mv fetch.php\?media\=en\:fb15k.tgz fb15k.tgz
# -O pins the output name, so a re-run overwrites the archive instead of
# saving a numbered copy (".1") while tar keeps extracting the stale one.
wget -O fb15k.tgz \
  "https://github.com/mana-ysh/knowledge-graph-embeddings/raw/master/dat/fb15k.tgz"
tar -xzvf fb15k.tgz
DIR=FB15k/
# Remove aggregates left by an earlier run: their names match the ft_* globs
# used below, so they would otherwise be fed back into themselves.
rm -f -- "${DIR}/ft_freebase_mtr100_mte100-full.txt" \
  "${DIR}/ft_freebase_mtr100_mte100-valid+train.txt"
for f in "${DIR}"/freebase*.txt; do
  fn="${DIR}/ft_$(basename "$f")"
  echo "$f" " --> " "$fn"
  # Each input line yields two output lines: one where field 1 carries the
  # "__label__" prefix (field 2 tagged "0_"), and one where field 3 carries
  # it (field 2 tagged "1_").
  awk '{print "__label__"$1,"0_"$2, $3;print $1,"1_"$2," __label__"$3}' < "${f}" > "${fn}"
done
cat "${DIR}"/ft_* > "${DIR}/ft_freebase_mtr100_mte100-full.txt"
cat "${DIR}"/ft_*train.txt "${DIR}"/ft_*valid.txt > "${DIR}/ft_freebase_mtr100_mte100-valid+train.txt"

# --- FB15K-237 --------------------------------------------------------------
# Downloads and unpacks the FB15K-237 archive, rewrites train/test/valid.txt
# (tab-separated) into "__label__"-prefixed copies named ft_<split>.txt, then
# builds two aggregates inside ${DIR}:
#   ft_full.txt         concatenation of every ft_*.txt file
#   ft_valid+train.txt  ft_train.txt followed by ft_valid.txt
echo "preparing FB15K-237"
# -O pins the output name, so a re-run overwrites the archive instead of
# saving a numbered copy (".1") while unzip keeps extracting the stale one.
wget -O FB15K-237.2.zip \
  "https://download.microsoft.com/download/8/7/0/8700516A-AB3D-4850-B4BB-805C515AECE1/FB15K-237.2.zip"
# -o overwrites already-extracted files instead of prompting on a re-run.
unzip -o FB15K-237.2.zip
DIR=Release/
# Remove aggregates left by an earlier run: ft_full.txt and ft_valid+train.txt
# match the ft_*.txt glob below and would otherwise be fed back into ft_full.txt.
rm -f -- "${DIR}/ft_full.txt" "${DIR}/ft_valid+train.txt"
for f in train.txt test.txt valid.txt; do
  fn="${DIR}/ft_$(basename "$f")"
  echo "$f" " --> " "$fn"
  # Each input line yields two output lines: one where field 1 carries the
  # "__label__" prefix (field 2 tagged "0_"), and one where field 3 carries
  # it (field 2 tagged "1_"). Input fields are split on tabs only.
  awk -F "\t" '{print "__label__"$1,"0_"$2, $3;print $1,"1_"$2," __label__"$3}' < "${DIR}/${f}" > "${fn}"
done
cat "${DIR}"/ft_*.txt > "${DIR}/ft_full.txt"
cat "${DIR}/ft_train.txt" "${DIR}/ft_valid.txt" > "${DIR}/ft_valid+train.txt"

# --- SVO --------------------------------------------------------------------
# Downloads and unpacks the SVO tensor dataset, rewrites every svo_data*.dat
# file into a "__label__"-prefixed copy named ft_<original name>, then builds
# ${DIR}/ft_svo_data-valid+train.dat from the train files followed by the
# valid files.
echo "preparing SVO"
# The download previously ran as `wget . URL`: the stray "." was taken as a
# second URL, so wget reported failure and `set -e` stopped the script here.
# The URL also contains "?", a glob character, so it is quoted. -O writes
# straight to the final name, which replaces the separate rename step.
wget -O svo-tensor-dataset.tar.gz \
  "https://everest.hds.utc.fr/lib/exe/fetch.php?media=en:svo-tensor-dataset.tar.gz"
tar -xzvf svo-tensor-dataset.tar.gz
DIR=SVO-tensor-dataset
# Remove the aggregate left by an earlier run: its name matches both
# ft_*train*.dat and ft_*valid*.dat, so it would be fed back into itself.
rm -f -- "${DIR}/ft_svo_data-valid+train.dat"
for f in "${DIR}"/svo_data*.dat; do
  fn="${DIR}/ft_$(basename "$f")"
  # One output line per input line: field 1 tagged "0_", field 3 tagged "1_",
  # and field 2 emitted last with the "__label__" prefix.
  awk '{print "0_"$1,"1_"$3,"__label__"$2;}' < "${f}" > "${fn}"
done
cat "${DIR}"/ft_*train*.dat "${DIR}"/ft_*valid*.dat > "${DIR}/ft_svo_data-valid+train.dat"