File: Classify.hs

package info (click to toggle)
hscolour 1.25-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 224 kB
  • sloc: haskell: 1,171; sh: 15; makefile: 8
file content (135 lines) | stat: -rw-r--r-- 4,980 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
module Language.Haskell.HsColour.Classify
  ( TokenType(..)
  , tokenise
  ) where

import Data.Char (isSpace, isUpper, isLower, isDigit)
import Data.List

-- | Lex Haskell source code into an annotated token stream, without
--   discarding any characters or layout.
tokenise :: String -> [(TokenType,String)]
tokenise str = 
    let chunks = glue . chunk $ str 
    in markDefs $ map (\s-> (classify s,s)) chunks

markDefs :: [(TokenType, String)] -> [(TokenType, String)]
markDefs [] = []
markDefs ((Varid, s) : rest) = (Definition, s) : continue rest
markDefs ((Varop, ">") : (Space, " ") : (Varid, d) : rest) =
    (Varop, ">") : (Space, " ") : (Definition, d) : continue rest
markDefs rest = continue rest

continue rest 
    = let (thisLine, nextLine) = span (/= (Space, "\n")) rest
      in
        case nextLine of
          [] -> thisLine
          ((Space, "\n"):nextLine') -> (thisLine ++ ((Space, "\n") : (markDefs nextLine')))


-- Basic Haskell lexing, except we keep whitespace.
chunk :: String -> [String]
chunk []    = []
chunk ('\r':s) = chunk s -- get rid of DOS newline stuff
chunk ('\n':s) = "\n": chunk s
chunk (c:s) | isLinearSpace c
            = (c:ss): chunk rest where (ss,rest) = span isLinearSpace s
chunk ('{':'-':s) = let (com,s') = nestcomment 0 s
                    in ('{':'-':com) : chunk s'
chunk s = case Prelude.lex s of
              []             -> [head s]: chunk (tail s) -- e.g. inside comment
              ((tok@('-':'-':_),rest):_)
                  | all (=='-') tok -> (tok++com): chunk s'
                                       where (com,s') = eolcomment rest
              ((tok,rest):_) -> tok: chunk rest

isLinearSpace c = c `elem` " \t\f" -- " \t\xa0"

-- Glue sequences of tokens into more useful blobs
glue (q:".":n:rest) | isUpper (head q)	-- qualified names
                    = glue ((q++"."++n): rest)
glue ("`":rest) =				-- `varid` -> varop
  case glue rest of
    (qn:"`":rest) -> ("`"++qn++"`"): glue rest
    _             -> "`": glue rest
glue (s:ss)       | all (=='-') s && length s >=2	-- eol comment
                  = (s++concat c): glue rest
                  where (c,rest) = break ('\n'`elem`) ss
--glue ("{":"-":ss)  = ("{-"++c): glue rest	-- nested comment
--                  where (c,rest) = nestcomment 0 ss
glue ("(":ss) = case rest of
                ")":rest -> ("(" ++ concat tuple ++ ")") : glue rest
                _         -> "(" : glue ss
              where (tuple,rest) = span (==",") ss
glue ("[":"]":ss) = "[]" : glue ss
glue ("\n":"#":ss)= "\n" : ('#':concat line) : glue rest
                  where (line,rest) = break ('\n'`elem`) ss
glue (s:ss)       = s: glue ss
glue []           = []

-- Deal with comments.
nestcomment :: Int -> String -> (String,String)
nestcomment n ('{':'-':ss) | n>=0 = (("{-"++cs),rm)
                                  where (cs,rm) = nestcomment (n+1) ss
nestcomment n ('-':'}':ss) | n>0  = (("-}"++cs),rm)
                                  where (cs,rm) = nestcomment (n-1) ss
nestcomment n ('-':'}':ss) | n==0 = ("-}",ss)
nestcomment n (s:ss)       | n>=0 = ((s:cs),rm)
                                  where (cs,rm) = nestcomment n ss
nestcomment n [] = ([],[])

eolcomment :: String -> (String,String)
eolcomment s@('\n':_) = ([], s)
eolcomment ('\r':s)   = eolcomment s
eolcomment (c:s)      = (c:cs, s') where (cs,s') = eolcomment s
eolcomment []         = ([],[])

-- | Classification of tokens as lexical entities
data TokenType =
  Space | Keyword | Keyglyph | Layout | Comment | Conid | Varid |
  Conop | Varop   | String   | Char   | Number  | Cpp   | Error |
  Definition
  deriving (Eq,Show)

classify :: String -> TokenType
classify s@(h:t)
    | isSpace h              = Space
    | all (=='-') s          = Comment
    | "--" `isPrefixOf` s
      && any isSpace s       = Comment		-- not fully correct
    | "{-" `isPrefixOf` s    = Comment
    | s `elem` keywords      = Keyword
    | s `elem` keyglyphs     = Keyglyph
    | s `elem` layoutchars   = Layout
    | isUpper h              = Conid
    | s == "[]"              = Conid
    | h == '(' && isTupleTail t = Conid
    | h == '#'               = Cpp
    | isLower h              = Varid
    | h `elem` symbols       = Varop
    | h==':'                 = Conop
    | h=='`'                 = Varop
    | h=='"'                 = String
    | h=='\''                = Char
    | isDigit h              = Number
    | otherwise              = Error
classify _ = Space

isTupleTail [')'] = True
isTupleTail (',':xs) = isTupleTail xs
isTupleTail _ = False


-- Haskell keywords
keywords =
  ["case","class","data","default","deriving","do","else","forall"
  ,"if","import","in","infix","infixl","infixr","instance","let","module"
  ,"newtype","of","qualified","then","type","where","_"
  ,"foreign","ccall","as","safe","unsafe","family"]
keyglyphs =
  ["..","::","=","\\","|","<-","->","@","~","=>","[","]"]
layoutchars =
  map (:[]) ";{}(),"
symbols =
  "!#$%&*+./<=>?@\\^|-~"