File: treebank_words.py

package info (click to toggle)
golang-github-jdkato-prose 1.1.0%2Bgit20171031.e27abfd-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 12,848 kB
  • sloc: python: 115; makefile: 55; sh: 21
file content (21 lines) | stat: -rw-r--r-- 543 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import json
import os

from nltk.tokenize import TreebankWordTokenizer, sent_tokenize

# Word tokenizer applied to every sentence found in the input.
t = TreebankWordTokenizer()

# Read the input JSON; each element is handed to sent_tokenize below
# (presumably a list of plain-text strings -- confirm against the fixture).
source_path = os.path.join('testdata', 'tokenize.json')
with open(source_path) as src:
    data = json.load(src)

# Flatten all texts into one list of sentences (input order preserved), then
# tokenize each sentence so that words[i] holds the tokens of sents[i].
sents = [sentence for text in data for sentence in sent_tokenize(text)]
words = [t.tokenize(sentence) for sentence in sents]

# Write the per-sentence token lists as indented JSON.
words_path = os.path.join('testdata', 'treebank_words.json')
with open(words_path, 'w') as out:
    json.dump(words, out, indent=4)

# Write the sentences themselves, in the same order as the token lists.
sents_path = os.path.join('testdata', 'treebank_sents.json')
with open(sents_path, 'w') as out:
    json.dump(sents, out, indent=4)