File: 08-topmine_ngrammer.py

package info (click to toggle)
python-pattern 2.6%2Bgit20180818-4.1
  • links: PTS
  • area: main
  • in suites: sid, trixie
  • size: 95,160 kB
  • sloc: python: 28,135; xml: 15,085; javascript: 5,810; makefile: 194
file content (73 lines) | stat: -rw-r--r-- 1,913 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from __future__ import print_function
from __future__ import unicode_literals

from builtins import str, bytes, dict, int

import os
import sys
import codecs
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

import pattern.text as text_module
from pattern.text.en.wordlist import STOPWORDS

# Collect every entry of ./texts (resolved against the current working
# directory). os.listdir() returns names in arbitrary order, so the listing is
# sorted to make texts[0] -- the document the n-grams are extracted from
# below -- the same file on every run.
paths = ['./texts/' + name for name in sorted(os.listdir('./texts'))]

# The files are decoded as latin-1. codecs.open() with an encoding already
# yields unicode text on both Python 2 and Python 3, so no per-version
# conversion of the result is needed.
texts = []
for p in paths:
    with codecs.open(p, "rb", encoding='latin-1') as f:
        texts.append(f.read())

# First pass: train the n-grammer on the corpus without any stopword list,
# then extract the n-grams of the first text.
ng = text_module.train_topmine_ngrammer(texts, threshhold=1, regexp="[^a-zA-Z0-9]")
ngrams = text_module.topmine_ngramms(texts[0], ng, threshhold=1)

print("\n")

# A key with two "_"-separated parts is reported as a bigram, one with three
# parts as a trigram; keys of any other length are ignored.
part_counts = [(key, len(key.split("_"))) for key in ngrams.keys()]
bigrams = [key for key, parts in part_counts if parts == 2]
trigrams = [key for key, parts in part_counts if parts == 3]

print("Extracted {} bigrams:\n".format(len(bigrams)))
print(bigrams)
print("\n")

print("Extracted {} trigrams:\n".format(len(trigrams)))
print(trigrams)
print("\n")


# As we can see, the extracted n-grams contain many stopwords, so it is important to
# remove all stopwords before applying the algorithm.

# Second pass: same training and extraction, but with STOPWORDS passed to the
# trainer.
ng = text_module.train_topmine_ngrammer(texts, threshhold=1, regexp="[^a-zA-Z0-9]", stopwords=STOPWORDS)
ngrams = text_module.topmine_ngramms(texts[0], ng, threshhold=1)

print("\n")

# Report the result of the stopword-aware run in the same way as before:
# keys with two "_"-separated parts are bigrams, keys with three are trigrams.
part_counts = [(key, len(key.split("_"))) for key in ngrams.keys()]
bigrams = [key for key, parts in part_counts if parts == 2]
trigrams = [key for key, parts in part_counts if parts == 3]

print("Extracted {} bigrams:\n".format(len(bigrams)))
print(bigrams)
print("\n")

print("Extracted {} trigrams:\n".format(len(trigrams)))
print(trigrams)
print("\n")