File: word_punct.py

package info (click to toggle)

golang-github-jdkato-prose 1.1.0%2Bgit20171031.e27abfd-2

links: PTS, VCS
area: main
in suites: buster
size: 12,848 kB
sloc: python: 115; makefile: 55; sh: 21

file content (14 lines) | stat: -rw-r--r-- 326 bytes

parent folder | download | duplicates (3)

import json
import os

from nltk.tokenize import wordpunct_tokenize

# Read the entries stored in testdata/treebank_sents.json, run NLTK's
# wordpunct_tokenize over each one, and write the resulting lists to
# testdata/word_punct.json as indented JSON.
source_path = os.path.join('testdata', 'treebank_sents.json')
target_path = os.path.join('testdata', 'word_punct.json')

with open(source_path) as src:
    data = json.load(src)

# One tokenizer result per input entry, kept in the input's order.
words = [wordpunct_tokenize(entry) for entry in data]

with open(target_path, 'w') as out:
    json.dump(words, out, indent=4)