File: test_svm.crm

package info (click to toggle)
crm114 20100106-9
links: PTS
area: main
in suites: buster
size: 3,184 kB
sloc: ansic: 34,910; sh: 617; makefile: 578; lisp: 208
file content (133 lines) | stat: -rwxr-xr-x 4,043 bytes
parent folder | download | duplicates (6)
#! /usr/bin/crm
#
#	test_svm.crm - test SVM filter

# Copyright 2009 William S. Yerazunis.
# This file is under GPLv3, as described in COPYING.

window
#
#	Command-line tunables.  Each one is declared <default>, so a value
#	supplied on the command line (for example --N=500) takes precedence
#	over the value written here.
#
isolate (:index_file:) <default> /index/  # files to run, in TREC index format
				#  that is, with one data per line like:
				#  "ham"|"spam" relative/file/name
isolate (:P:) <default>   //   # if set with --P, do by paragraphs
isolate (:L:) <default>   /100000/ # run microgroomed learn modulo this cycle
isolate (:S:) <default>   /0/  # number of msgs to skip off start of index
isolate (:N:) <default>   /1000/   # number of msgs to learn after the skip
isolate (:M:) <default>   /1000/   # number of msgs to classify after learning
#
#	Statistics files handed to learn.  These stay fixed rather than
#	<default> so that they always agree with the names used at classify
#	time.
#
isolate (:ham_svm:)	  /ham.svm/
isolate (:spam_svm:)      /spam.svm/
isolate (:ham_vs_spam:)   /ham_vs_spam.svmhyp/

#	Working variables and counters.
isolate (:total: :lino: :messago: :filo: :label: :stat:)
isolate (:nham:) /0/
isolate (:nspam:) /0/
isolate (:correct:) /0/

#output /N = ':*:N:'\n/
#	Remember how many messages will be classified, before the classify
#	loop counts :M: down - this is the divisor for the accuracy figure.
eval (:total:) /:*:M: /
#	Read the whole index into memory.  The loops below walk through it
#	one line at a time.
input (:whole_index_file:) [:*:index_file:]

#   Skip first :S: files
#   Each pass consumes one index line and counts :S: down by one.
output /Skipping first :*:S: files.\n/
{
	#  Test before consuming anything, so that S = 0 skips no index
	#  line at all.  This eval fails, ending the loop, once :S: is 0.
	eval /:@: :*:S: > 0 :/
	#  Consume one index line.  If the index is already exhausted this
	#  match fails, which also ends the loop.
	match <fromend> [:whole_index_file:] /[^\n]*\n/ (:lino:)
	eval (:S:) /:@: :*:S: - 1 :/
	liaf
}

#   Training pass.  Consumes up to :N: further index lines and learns each
#   message into the ham or the spam statistics file according to the
#   label on its index line.
output /Learning the next :*:N: files.\n/
{
	#  Loop guard - this eval fails, ending the loop, once :N: reaches 0.
	eval /:@: :*:N: > 0 :/
	#output /N = ':*:N:'\n/
	{
		#  Take the next index line, continuing from the end of the
		#  previous match on :whole_index_file:.
		match <fromend> [:whole_index_file:] /[^\n]*\n/ (:lino:)
		#output /:*:lino:/
		#  First token on the line is the label, second is the file name.
		match <nomultiline> [:lino:] /([[:graph:]]+)[[:space:]]+([[:graph:]]+)/(:: :label: :filo:)
		#  Read the message file, with offset 0 and length 10000.
		input (:messago:) [:*:filo: 0 10000]
		#output /learning file :*:filo:\n/
		{
			#  Ham branch - taken when the label matches ham.
			match /ham/ [:label:]
			#output /This is a ham!\n/
			eval (:nham:) /:@: :*:nham: + 1:/
			{
			    #  Paragraph mode, entered only when :P: contains SET.
			    #  Each pass learns the next chunk matched by the
			    #  regex below, and liaf repeats until a match fails.
			    match [:P:] /SET/
			    match [:messago:] <fromend> (:one_para:) \
				/([[:graph:]]+[[:space:]]+){7}.*?\n\n/
			    learn (:*:ham_svm:) < svm unigram unique> \
                             [:one_para:] /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1/
			    liaf
			}
			#  NOTE(review): the paragraph loop above can only exit
			#  by failing, so it looks as though the whole-message
			#  learn below also runs in --P mode - confirm whether
			#  that is intended.
			alius
                        learn (:*:ham_svm:) < svm unigram unique > \
                             [:messago:] /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1/

		}
		#  Reached only when the ham branch failed.
		alius
		{
			#  Spam branch - same shape as the ham branch, learning
			#  into the spam statistics file instead.
			match /spam/ [:label:]
			#output /This is a spam!\n/
			eval (:nspam:) /:@: :*:nspam: + 1:/
			{
			   match [:P:] /SET/
			   match [:messago:] <fromend> (:one_para:) \
				/([[:graph:]]+[[:space:]]+){7}.*?\n\n/
  			   learn (:*:spam_svm:) < svm unigram unique > \
 			    [:one_para:] /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1/
			   liaf
			}
			alius
			learn (:*:spam_svm:) < svm unigram unique > \
                           [:messago:] /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1/
		}
	}
	#  Count the entry as used whether or not anything was learned from it.
	eval (:N:) /:@: :*:N: - 1:/
	{
	    #  Whenever the remaining count is a multiple of :L:, run a learn
	    #  with the microgroom flag over the three statistics files and
	    #  print a star as a progress mark.
	    eval /:@: :*:N: % :*:L: = 0 :/
 	    learn (:*:ham_svm: |:*:spam_svm:|:*:ham_vs_spam:) \
	    < svm unigram unique microgroom > \
	    /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1.0/
	    output /*/
	}
	liaf
}

#   Report how many messages of each class were trained, then run one
#   learn, without the microgroom flag, over all three statistics files.
output /total number of training spam is :*:nspam:\ntotal number of training ham is :*:nham:\n/
output /Calculating ideal SVM to separate the classes.\n/
learn (:*:ham_svm: |:*:spam_svm:|:*:ham_vs_spam:) \
	< svm unigram unique > \
	/[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1/
output /Starting classify on next :*:M: files.\n/
##start classify
#   Classification pass.  Consumes up to :M: further index lines,
#   classifies each message and counts the ones whose result agrees
#   with the label on the index line.
{
	#  Loop guard - this eval fails, ending the loop, once :M: reaches 0.
	eval /:@: :*:M: > 0 :/
	{
		match <fromend> [:whole_index_file:] /[^\n]*\n/ (:lino:)
		output /start classify :*:lino:/
		#  First token on the line is the label, second is the file name.
		match <nomultiline> [:lino:] /([[:graph:]]+)[[:space:]]+([[:graph:]]+)/(:: :label: :filo:)
		input (:messago:) [:*:filo: 0 10000]
		{
			#  Use the same statistics file variables as the learn
			#  statements, so training and classification cannot
			#  drift apart.  The classify sits in its own block so
			#  that a failing classify only leaves this inner block.
			classify (:*:ham_svm:|:*:spam_svm:|:*:ham_vs_spam:) (:stat:)[:messago:] < svm unigram unique > /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1/
		}
		output /:*:stat:\n/
		#  Score against the parsed label rather than the whole index
		#  line, so a file path that happens to contain ham or spam
		#  cannot be mistaken for the label.
		{
			match [:stat:] /succeeds/
			match [:label:] /ham/
			eval (:correct:) /:@: :*:correct: + 1:/
		}
		alius
		{
			match [:stat:] /fails/
			match [:label:] /spam/
			eval (:correct:) /:@: :*:correct: + 1:/
		}

	}
	eval (:M:) /:@: :*:M: - 1:/

	liaf
}
#   Final report.  :total: holds the value :M: had before the classify
#   loop ran, and :correct: holds the number of agreeing classifications.
output /total=:*:total:\n/
output /correct=:*:correct:\n/
#   :correct: is reused as scratch space from here on.  It is first
#   overwritten with the ratio correct over total, then with that ratio
#   times 100, and the second value is what gets printed as the accuracy.
eval (:correct:) /:@: (:*:correct:) \/ (:*:total:) : /
eval (:correct:) /:@: :*:correct: * 100 :/
output /accuracy = :*:correct:%/
#output /total number of predicted texts = :*:total:,number of correctly classified texts = :*:correct:\n/