1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
|
(* $Id: xstr_split.ml,v 1.2 1999/07/06 21:32:09 gerd Exp $
* ----------------------------------------------------------------------
*
*)
type cclass =
CData
| CIgnore
| CSeparator
| CIgnoreOrSeparator
;;
let split_string ignoreset ignoreleft ignoreright separators (* s *) =
(* 'ignoreset': Characters that are ignored before and after the
* separator
* 'ignoreleft': true means to ignore characters from 'ignoreset' at
* the beginning of the string
* 'ignoreright': true means to ignore characters from 'ignoreset' at
* the end of the string
* 'separators': list of separating strings
* 's': string to split
*
* EXAMPLES:
*
* - split_string " " true true [ "," ] "a, b, c ,d "
* = [ "a"; "b"; "c"; "d" ]
* - split_string "" true true [ "," ] "a, b, c ,d "
* = [ "a"; " b"; " c "; "d " ]
* - split_string " " false false [ "," ] "a, b, c ,d "
* = [ "a"; "b"; "c"; "d " ]
*)
let character_classification =
let a = Array.create 256 CData in
let sepchars =
List.flatten
(List.map (fun sep -> if sep <> "" then [sep.[0]] else []) separators)
in
let ignorechars =
let l = ref [] in
for k = 0 to String.length ignoreset - 1 do
l := ignoreset.[k] :: !l
done;
!l
in
List.iter
(fun c ->
a.( Char.code c ) <- CSeparator)
sepchars;
List.iter
(fun c ->
let code = Char.code c in
if a.( code ) = CSeparator then
a.( code ) <- CIgnoreOrSeparator
else
a.( code ) <- CIgnore)
ignorechars;
a
in
fun s ->
let l = String.length s in
let rec split_over_word i_wordbeg i_wordend i_current =
(* i_wordbeg <= i_wordend: i_current has not yet reached the next
* separator. i_wordbeg is the position of
* the first CData character, i_wordend the
* position after the last CData Character of
* the word.
*)
if i_current < l then begin
let code = Char.code (s.[i_current]) in
let cl = character_classification.(code) in
match cl with
CData -> (* split i_wordbeg (i_current+1) (i_current+1) *)
fast_skip_word i_wordbeg (i_current+1)
| CIgnore -> split_over_word i_wordbeg i_wordend (i_current+1)
| _ ->
let rec find_sep sepl =
match sepl with
[] ->
if cl = CSeparator then (* just as CData *)
(* split i_wordbeg (i_current+1) (i_current+1) *)
fast_skip_word i_wordbeg (i_current+1)
else (* just as CIgnore *)
split_over_word i_wordbeg i_wordend (i_current+1)
| sep :: sepl' ->
let lsep = String.length sep in
if i_current + lsep <= l &
String.sub s i_current lsep = sep then
(* found separator *)
String.sub s i_wordbeg (i_wordend - i_wordbeg) ::
split_after_word (i_current + lsep) (i_current + lsep)
else
find_sep sepl'
in
find_sep separators
end
else
(* i_current >= l *)
if ignoreright then
[ String.sub s i_wordbeg (i_wordend - i_wordbeg) ]
else
[ String.sub s i_wordbeg (i_current - i_wordbeg) ]
and split_after_word i_wordbeg i_current =
(* i_wordbeg > i_wordend: i_current is just after the separator and
* searches the next word beginning
*)
if i_current < l then begin
let code = Char.code (s.[i_current]) in
let cl = character_classification.(code) in
match cl with
CData ->
(* split i_current (i_current+1) (i_current+1) *)
fast_skip_word i_current (i_current+1)
| (CIgnore|CIgnoreOrSeparator) ->
split_after_word i_wordbeg (i_current+1)
| CSeparator ->
let rec find_sep sepl =
match sepl with
[] ->
(* split i_wordbeg (i_current+1) (i_current+1) *)
fast_skip_word i_wordbeg (i_current+1)
| sep :: sepl' ->
let lsep = String.length sep in
if i_current + lsep < l &
String.sub s i_current lsep = sep then
(* found separator *)
"" ::
split_after_word (i_current + lsep) (i_current + lsep)
else
find_sep sepl'
in
find_sep separators
end
else
(* i_current >= l *)
if i_wordbeg = 0 then
[] (* not any word found *)
else
[ "" ]
(* Now some frequent special cases *)
and fast_skip_word i_wordbeg i_current =
(* i_wordbeg <= i_current = i_wordend *)
if i_current < l-1 then begin
let code1 = Char.code (s.[i_current]) in
let cl1 = character_classification.(code1) in
match cl1 with
CData ->
begin
let code2 = Char.code (s.[i_current+1]) in
let cl2 = character_classification.(code2) in
match cl2 with
CData -> fast_skip_word i_wordbeg (i_current+2)
| CIgnore -> split_over_word i_wordbeg (i_current+1) (i_current+2)
| _ -> (* continue with the general routine *)
split_over_word i_wordbeg (i_current+1) (i_current+1)
end
| CIgnore -> split_over_word i_wordbeg i_current (i_current+1)
| _ -> (* continue with the general routine *)
split_over_word i_wordbeg i_current i_current
end
else split_over_word i_wordbeg i_current i_current
in
if ignoreleft then
split_after_word 0 0
else
split_over_word 0 0 0
;;
(* ======================================================================
* History:
*
* $Log: xstr_split.ml,v $
* Revision 1.2 1999/07/06 21:32:09 gerd
* Tried to optimize the function; but currently without success.
* There should be deeper analysis -- on the other hand, splitting seems
* to be relative fast compared with the Str splitting function.
* Perhaps the improvements have an effect on machines with bigger caches.
*
* Revision 1.1 1999/06/27 23:03:38 gerd
* Initial revision.
*
*
*)
|