File: parse-unidata.tcl

package info (click to toggle)
jimtcl 0.81%2Bdfsg0-2
links: PTS, VCS
area: main
in suites: bookworm
size: 15,416 kB
sloc: ansic: 175,569; tcl: 5,456; sh: 4,814; cpp: 1,671; makefile: 269
file content (143 lines) | stat: -rw-r--r-- 3,175 bytes
parent folder | download | duplicates (3)
#!/usr/bin/env tclsh

# Generate UTF-8 case mapping tables
#
# (c) 2010 Steve Bennett <steveb@workware.net.au>
#
# See LICENCE for licence details.
#/

# Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
# and http://unicode.org/Public/UNIDATA/EastAsianWidth.txt
# to generate case mapping and display width tables
set map(lower) {}
set map(upper) {}
set map(title) {}
set map(combining) {}
set map(wide) {}

set USAGE "Usage: parse-unidata.tcl \[-width\] UnicodeData.txt \[EastAsianWidth.txt\]"
set do_width 0

if {[lindex $argv 0] eq "-width"} {
	set do_width 1
	set argv [lrange $argv 1 end]
}

if {[llength $argv] ni {1 2}} {
	puts stderr $USAGE
	exit 1
}

lassign $argv unicodefile widthfile

set f [open $unicodefile]
while {[gets $f buf] >= 0} {
	# Remove any trailing whitespace, especially errant CR
	set buf [string trim $buf]
	set title ""
	set lower ""
	set upper ""
	lassign [split $buf ";"] code name class x x x x x x x x x upper lower title
	set codex [string tolower 0x$code]
	if {[string match M* $class]} {
		if {![info exists combining]} {
			set combining $codex
		}
		continue
	} elseif {[info exists combining]} {
		lappend map(combining) $combining $codex
		unset combining
	}
	if {$codex <= 0x7f} {
		continue
	}
	if {$codex > 0xffff} {
		break
	}
	if {![string match L* $class]} {
		continue
	}
	if {$upper ne ""} {
		lappend map(upper) $codex [string tolower 0x$upper]
	}
	if {$lower ne ""} {
		lappend map(lower) $codex [string tolower 0x$lower]
	}
	if {$title ne "" && $title ne $upper} {
		if {$title eq $code} {
			set title 0
		}
		lappend map(title) $codex [string tolower 0x$title]
	}
}
close $f

proc output-int-pairs {list} {
	set n 0
	foreach {v1 v2} $list {
		puts -nonewline "\t{ $v1, $v2 },"
		if {[incr n] % 4 == 0} {
			puts ""
		}
	}
	if {$n % 4} {
		puts ""
	}
}

# Merges adjacent ranges in a list of ranges (lower upper lower upper ...)
proc combine-adjacent-ranges {list} {
	set newlist {}
	foreach {lower upper} $list {
		if {[info exists prev_upper]} {
			if {$lower == $prev_upper + 1} {
				# combine these
				set prev_upper $upper
				continue
			} else {
				# can't combine
				lappend newlist $prev_lower $prev_upper
			}
		}
		set prev_lower $lower
		set prev_upper $upper
	}
	# Now add the last range
	lappend newlist $prev_lower $prev_upper
	return $newlist
}

foreach type {upper lower title} {
	puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
	output-int-pairs $map($type)
	puts "\};\n"
}

if {$do_width} {
	set f [open $widthfile]
	while {[gets $f buf] >= 0} {
		# Remove any trailing whitespace, especially errant CR
		set buf [string trim $buf]
		if {[regexp {^([0-9A-Fa-f.]+);W} $buf -> range]} {
			set range [string tolower $range]
			lassign [split $range .] lower - upper
			if {$upper eq ""} {
				set upper $lower
			}
			lappend map(wide) 0x$lower 0x$upper
		}
	}
	close $f
}

foreach type {combining wide} {
	puts "static const struct utf8range unicode_range_$type\[\] = \{"
	if {$do_width} {
		output-int-pairs [combine-adjacent-ranges $map($type)]
	} else {
		# Just produce empty width tables in this case
		output-int-pairs {}
	}
	puts "\};\n"
}