import json
import os

from nltk.tokenize import TreebankWordTokenizer, sent_tokenize

t = TreebankWordTokenizer()

# Read the raw test texts (assumed to be a JSON list of strings).
with open(os.path.join('testdata', 'tokenize.json')) as d:
    data = json.load(d)

words = []
sents = []
for text in data:
    # Split each text into sentences, then word-tokenize each sentence,
    # keeping the two output lists aligned sentence-for-sentence.
    for s in sent_tokenize(text):
        sents.append(s)
        words.append(t.tokenize(s))

# Write the expected tokenizer output as test fixtures.
with open(os.path.join('testdata', 'treebank_words.json'), 'w') as f:
    json.dump(words, f, indent=4)
with open(os.path.join('testdata', 'treebank_sents.json'), 'w') as f:
    json.dump(sents, f, indent=4)
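Note that sent_tokenize depends on NLTK's Punkt sentence-tokenizer models, which are not bundled with the library itself. If they are not already installed, a one-time download is needed before running the script above (a minimal sketch; the exact resource name depends on your NLTK version):

import nltk

nltk.download('punkt')      # Punkt sentence tokenizer models
# On newer NLTK releases the resource may instead be named 'punkt_tab':
# nltk.download('punkt_tab')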