File: wordfreq.py

Package: ipyparallel 8.8.0-6
"""Count the frequencies of words in a string"""


def wordfreq(text, is_filename=False):
    """Return a dictionary of words and word counts in a string."""
    if is_filename:
        with open(text) as f:
            text = f.read()
    freqs = {}
    for word in text.split():
        lword = word.lower()
        freqs[lword] = freqs.get(lword, 0) + 1
    return freqs


def print_wordfreq(freqs, n=10):
    """Print the n most common words and counts in the freqs dict."""

    words, counts = freqs.keys(), freqs.values()
    items = sorted(zip(counts, words), reverse=True)
    for count, word in items[:n]:
        print(word, count)


def wordfreq_to_weightsize(
    worddict, minsize=25, maxsize=50, minalpha=0.5, maxalpha=1.0
):
    """Map word counts to (alpha, fontsize) pairs scaled between the given bounds."""
    mincount = min(worddict.values())
    maxcount = max(worddict.values())
    # Guard against division by zero when every word has the same count.
    span = (maxcount - mincount) or 1
    weights = {}
    for k, v in worddict.items():
        w = (v - mincount) / span
        alpha = minalpha + (maxalpha - minalpha) * w
        size = minsize + (maxsize - minsize) * w
        weights[k] = (alpha, size)
    return weights


def tagcloud(worddict, n=10, minsize=25, maxsize=50, minalpha=0.5, maxalpha=1.0):
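    """Plot a simple matplotlib tag cloud of the n most heavily weighted words."""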
    import random

    from matplotlib import pyplot as plt

    worddict = wordfreq_to_weightsize(worddict, minsize, maxsize, minalpha, maxalpha)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_position([0.0, 0.0, 1.0, 1.0])
    plt.xticks([])
    plt.yticks([])

    words = worddict.keys()
    alphas = [v[0] for v in worddict.values()]
    sizes = [v[1] for v in worddict.values()]
    # zip() returns an iterator in Python 3, so build a sorted list with sorted()
    # rather than calling .sort() on the zip object.
    items = sorted(zip(alphas, sizes, words), reverse=True)
    for alpha, size, word in items[:n]:
        # xpos = random.normalvariate(0.5, 0.3)
        # ypos = random.normalvariate(0.5, 0.3)
        xpos = random.uniform(0.0, 1.0)
        ypos = random.uniform(0.0, 1.0)
        ax.text(xpos, ypos, word.lower(), alpha=alpha, fontsize=size)
    ax.autoscale_view()
    return ax
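

# Minimal usage sketch: assumes a local text file named "sample.txt" exists
# (the filename is hypothetical). It builds the word-frequency dict, prints the
# ten most common words, and displays the tag cloud with matplotlib.
if __name__ == "__main__":
    freqs = wordfreq("sample.txt", is_filename=True)
    print_wordfreq(freqs, n=10)
    tagcloud(freqs, n=25)

    from matplotlib import pyplot as plt

    plt.show()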