File: spacestotabs.py

package info (click to toggle)
biojava6-live 6.1.0%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 109,224 kB
  • sloc: java: 245,617; xml: 27,410; python: 64; makefile: 39; sh: 31
file content (103 lines) | stat: -rwxr-xr-x 3,526 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#! /usr/bin/env python
#
# Script to conservatively convert initial spaces to tabs
#
# The script first guesses the number of spaces used (e.g. 2,3,4, or 8).
# It then passes this to the unexpand utility for the actual conversion.
import sys,os
import optparse
import subprocess

def space_distribution(filename):
    """Reads the specified file and creates a histogram of the number of initial
    spaces in each line of the file.

    Only literal space characters at the very start of a line are counted;
    counting stops at the first other character, so empty lines and lines that
    begin with a tab are recorded as having 0 initial spaces.

    If the first non-space character is '*' and it is preceded by at least one
    space, one space is subtracted from the count. (This looks aimed at
    block-comment continuation lines such as " * text", which carry one
    alignment space on top of the real indentation.)

    filename: path of the file to read (opened in text mode)

    Returns a dictionary mapping n to the number of lines with n initial
        spaces. n is never negative. An empty file gives an empty dictionary.
    """
    spaces = {}
    with open(filename, 'r') as handle:
        # Iterate lazily instead of loading every line into memory first.
        for line in handle:
            indent = len(line) - len(line.lstrip(' '))
            # Discount the alignment space in front of a leading '*', but never
            # drop below zero: a '*' in column 0 simply has no initial spaces.
            if indent > 0 and line[indent:indent + 1] == '*':
                indent -= 1
            spaces[indent] = spaces.get(indent, 0) + 1
    return spaces

def max_compatible(distribution, threshold=.8, possible=[8,4,3,2]):
    """Pick the largest tabstop that fits the indentation found in a file

    distribution: a dictionary mapping a number of initial spaces to the
        number of lines that start with that many spaces
    threshold: fraction of the indented lines whose initial spaces must be a
        multiple of the tabstop for that tabstop to be accepted
    possible: list of candidate tabstops; tried from largest to smallest

    Lines with 0 initial spaces are ignored. A candidate is accepted
    immediately when every indented line is a multiple of it. Otherwise it is
    accepted only if at least threshold-fraction of the indented lines are a
    multiple of it, and (for threshold < 1) the number of indented lines times
    (1 - threshold) is at least 2.

    Returns the first accepted candidate, or None if no candidate is accepted.
    """
    # The set of indented lines does not depend on the candidate tabstop.
    indented = [(n, lines) for n, lines in distribution.items() if n != 0]

    for tabstop in sorted(possible, reverse=True):
        fits = sum(lines for n, lines in indented if n % tabstop == 0)
        misfits = sum(lines for n, lines in indented if n % tabstop != 0)
        total = fits + misfits

        # Every indented line agrees with this tabstop: no ambiguity.
        if misfits == 0 and fits > 0:
            return tabstop

        # With too few indented lines a fractional threshold is not
        # meaningful, so such a candidate is passed over.
        too_few = threshold < 1 and total * (1 - threshold) < 2
        if not too_few and fits >= total * threshold:
            return tabstop

    return None

def unexpand(filename, spaces):
    """Calls the unexpand command to convert initial spaces into tabs

    Runs the external `unexpand` utility with --first-only and a tab width of
    `spaces`, echoes the command line to stdout, and then overwrites
    `filename` with the utility's output.

    filename: path of the file to convert; it is rewritten in place
    spaces: number of spaces per tab, passed to unexpand as the -t value

    Raises subprocess.CalledProcessError if unexpand exits with a non-zero
        status. The file is only opened for writing after the command has
        succeeded, so in that case it is left unmodified.
    """
    args = ["unexpand","--first-only","-t",str(spaces),filename]
    # Parenthesised single-argument print behaves the same on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(" ".join(args))
    # Capture the complete output before touching the file, since the command
    # reads from the same path that is about to be overwritten.
    bits = subprocess.check_output(args)
    with open(filename,'wb') as out:
        out.write(bits)


if __name__ == "__main__":
    # Command-line entry point: for every file named on the command line,
    # guess its tabstop and either report the guess or (with -x) rewrite the
    # file with tabs.
    parser = optparse.OptionParser( usage="usage: python %prog [options] filenames..." )
    parser.add_option("-t","--threshold",help="fraction of lines with compatible spacing",
        dest="threshold",default=0.8, type='float',action='store')
    parser.add_option("-x","--unexpand",help="Unexpand spaces in the input files",
        dest="unexpand",default=False, action='store_true')
    (options,args) = parser.parse_args()

    if not args:
        parser.print_usage()
        # OptionParser.exit takes (status, msg); pass the exit status and the
        # message explicitly rather than the message in the status slot.
        parser.exit(1, "Error. No files\n")


    for f in args:
        dist = space_distribution(f)
        # skip files in which no line starts with a space (e.g. tabbed files);
        # list(dist) compares correctly on Python 3, where dict.keys() is a
        # view that never equals a list
        if list(dist) == [0]:
            continue
        c = max_compatible(dist,options.threshold)
        if c is not None:
            if options.unexpand:
                unexpand(f,c)
            else:
                # single-argument print(...) works on both Python 2 and 3
                print("Match\t%s\t%d\t%s"%(f,c,dist))
        else:
            print("Unclear\t%s\t?\t%s"%(f,dist))