1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
|
"""Script to download the movie review dataset"""
import os
import tarfile
from contextlib import closing

# ``urlopen`` lives in ``urllib`` on Python 2 and ``urllib.request`` on
# Python 3; try the old location first and fall back to the new one.
try:
    from urllib import urlopen
except ImportError:
    from urllib.request import urlopen


# Remote location of the gzip-compressed tarball.
URL = ("http://www.cs.cornell.edu/people/pabo/"
       "movie-review-data/review_polarity.tar.gz")

# Local file name of the archive: the last path component of URL.
ARCHIVE_NAME = URL.rsplit('/', 1)[1]

# Folder whose presence means the dataset is already available.
# NOTE(review): presumably this is the directory the archive unpacks to --
# confirm against the archive contents.
DATA_FOLDER = "txt_sentoken"


if not os.path.exists(DATA_FOLDER):

    if not os.path.exists(ARCHIVE_NAME):
        print("Downloading dataset from %s (3 MB)" % URL)
        # Close the HTTP response deterministically, and read the whole
        # payload *before* creating the local file so that a failed transfer
        # does not leave a truncated archive that the next run would mistake
        # for a complete download.
        with closing(urlopen(URL)) as response:
            payload = response.read()
        with open(ARCHIVE_NAME, 'wb') as archive:
            archive.write(payload)

    print("Decompressing %s" % ARCHIVE_NAME)
    with closing(tarfile.open(ARCHIVE_NAME, "r:gz")) as archive:
        # The archive comes from the network: where the interpreter supports
        # extraction filters, use the 'data' filter so members cannot escape
        # the destination directory. Older interpreters keep the legacy call.
        if hasattr(tarfile, "data_filter"):
            archive.extractall(path='.', filter='data')
        else:
            archive.extractall(path='.')
    # The archive is no longer needed once its contents are on disk.
    os.remove(ARCHIVE_NAME)
|