File: reader.py

package info (click to toggle)
grammalecte 2.1.2+ds2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 70,192 kB
  • sloc: python: 17,988; javascript: 9,417; java: 4,716; xml: 144; makefile: 28; sh: 11
file content (87 lines) | stat: -rw-r--r-- 2,732 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!python3
# Just a file for one-shot scripts

import os
import sys
import re

import graphspell.ibdawg as ibdawg

oDict = ibdawg.IBDAWG("fr-allvars.json")


def readFile (spf):
    """Yield the lines of UTF-8 text file <spf>, one at a time.

    If the file does not exist, print an error message and yield nothing.
    (As a generator, the check only runs once iteration begins.)
    """
    if not os.path.isfile(spf):
        print("# Error: file not found.")
        return
    with open(spf, "r", encoding="utf-8") as hSrc:
        yield from hSrc

# --------------------------------------------------------------------------------------------------

def listUnknownWords (spf):
    """Write to <spf>.res.txt every whitespace-separated token of <spf>
    that the dictionary <oDict> does not recognize, one per line.
    """
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        # Fix: was readFile(spfSrc) — <spfSrc> is undefined (NameError);
        # the parameter is named <spf>.
        for sLine in readFile(spf):
            sLine = sLine.strip()
            if not sLine:
                continue
            for sWord in sLine.split():
                if not oDict.isValid(sWord):
                    hDst.write(sWord+"\n")

# --------------------------------------------------------------------------------------------------

def createLexStatFile (spf, dStat):
    """Write to <spf>.res.txt each word of <spf> unknown to <oDict>, with
    its occurrence count taken from <dStat> (0 if absent).

    Lines beginning with “#” are ignored; output is sorted by count then
    word, descending. A progress counter is printed while reading.
    """
    dWord = {}
    for iLine, sLine in enumerate(readFile(spf)):
        if not sLine.startswith("#"):
            sKey = sLine.strip()
            # setdefault keeps the first value seen, like the explicit check did
            dWord.setdefault(sKey, dStat.get(sKey, 0))
        print(iLine, end="\r")

    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        lSorted = sorted(dWord.items(), key=lambda t: (t[1], t[0]), reverse=True)
        for sKey, nCount in lSorted:
            if not oDict.isValid(sKey):
                hDst.write(sKey + " " + str(nCount) + "\n")


def readStatFile (spf, dStat):
    """Accumulate “word count” pairs from <spf> into <dStat> and return it.

    Each non-comment line must hold exactly two fields: a word and an
    integer count. Counts for repeated words are summed.
    """
    print("read stats: " + spf)
    for sLine in readFile(spf):
        if sLine.startswith("#"):
            continue
        sWord, sCount = sLine.split()
        dStat[sWord] = dStat.get(sWord, 0) + int(sCount)
    return dStat


def readStatFilesAndCreateLexicon ():
    """Merge the three hard-coded stat files, then write the lexicon
    file for propositions.txt via createLexStatFile().
    """
    dStat = {}
    for sStatFile in ("stats1.txt", "stats2.txt", "stats3.txt"):
        readStatFile(sStatFile, dStat)
    createLexStatFile("propositions.txt", dStat)

# --------------------------------------------------------------------------------------------------

def isMoreThanOneSetInList (lSet):
    """Return True if <lSet> contains at least two distinct sets.

    Fixes over the previous version: the list is no longer mutated
    (pop(0) removed the first element as a caller-visible side effect),
    and an empty list returns False instead of raising IndexError.
    """
    if not lSet:
        return False
    aFirst = lSet[0]
    return any(aSet != aFirst for aSet in lSet[1:])

def filterLinesWithWordsWithDifferentStems (spf):
    """Copy to <spf>.res.txt only those lines of <spf> whose words do not
    all share the same set of stems (according to <oDict>).
    """
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        hDst.writelines(
            sLine
            for sLine in readFile(spf)
            if isMoreThanOneSetInList([set(oDict.stem(sWord)) for sWord in sLine.strip().split()])
        )

def filterHomophonicWords ():
    """Filter homophones.txt, keeping only lines whose words have
    differing stem sets (result goes to homophones.txt.res.txt).
    """
    filterLinesWithWordsWithDifferentStems("homophones.txt")

# --------------------------------------------------------------------------------------------------

# Script entry point: by default this one-shot run filters homophones.txt.
# The other helpers above (listUnknownWords, readStatFilesAndCreateLexicon, …)
# are invoked by editing this guard as needed.
if __name__ == '__main__' :
    filterHomophonicWords()