File: sang.c

package info (click to toggle)
sufary 2.1b3-4
  • links: PTS
  • area: main
  • in suites: potato
  • size: 2,032 kB
  • ctags: 968
  • sloc: ansic: 5,926; perl: 1,378; tcl: 771; makefile: 728; sh: 664; cpp: 192
file content (205 lines) | stat: -rw-r--r-- 5,195 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
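/*--------------------------------------------------------------*
  sang  ---  N-gram frequency counter built on a suffix array

  USAGE   sang -n NUM -t NUM FILENAME
  OPTION
  -n NUM : NUM is the n of the n-grams to count.
  -t NUM : threshold: n-grams whose frequency is NUM or less are not shown.

  N-grams that contain a newline are not reported.

[Example]
> cat test
ABCBACABBAACABCABCACABACABBACBACACAAABACCAB
> makeary -q test                  build the array file
> sang -n 6 -t 1 test              show 6-grams whose frequency is greater than 1
2 ACABBA
2 BACABB
> sang -n 3 -t 4 test              show trigrams whose frequency is greater than 4
6 ACA
5 BAC
6 CAB

  971029  Version 0.1  YAMASITA Tatuo (tatuo-y@is.aist-nara.ac.jp)
  980327  Version 0.2  NEW: change related to sa_sel() (original note unreadable), BUGFIX: display the result for the last N-gram
 *--------------------------------------------------------------*/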

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "sufary.h"

static void usage(void);
void do_sang(char *fname, int ng, int threshold);
void hyouji(char *nstr, int n_ctr, int threshold);
/* Defined at the end of this file; declared here because hyouji() calls
   it before its definition (implicit declarations are invalid since C99). */
int mojibakebousi(char *buf, int haba);

char nstr[1000]; /* buffer holding the n-gram currently being counted */

/**********************************************
 * int main(int argc, char *argv[])
 *
 * purpose
 *   Parse the command line and run do_sang() on every file name
 *   found.  Arguments are handled left to right, so -n and -t only
 *   affect the file names that come after them.
 *
 * return value
 *   0 when all arguments were processed; the program exits with
 *   status 1 after printing the usage text on a command line error.
 **********************************************/
int main(int argc, char *argv[])
{
  int ng = 0;        /* n of the n-gram; 0 means -n has not been given yet */
  int threshold = 0; /* threshold */

  /* no arguments at all: show the help text */
  if (argc <= 1) {
    usage();
    exit(1);
  }

  while (argc > 1) {
    if (argv[1][0] == '-') {
      /* option */
      switch (argv[1][1]) {
      case 'n': /* n of the n-gram */
        if (argc <= 2) { /* the value is missing */
          usage();
          exit(1);
        }
        ng = atoi(argv[2]);
        argc--; argv++;
        break;
      case 't': /* threshold */
        if (argc <= 2) { /* the value is missing */
          usage();
          exit(1);
        }
        threshold = atoi(argv[2]);
        argc--; argv++;
        break;
      default: /* unknown option */
        usage();
        exit(1);
      }
    } else {
      /* a file name: -n must have supplied a usable length by now */
      if (ng <= 0) {
        usage();
        exit(1);
      }
      do_sang(argv[1], ng, threshold);
    }
    /* move on to the next argument */
    argc--; argv++;
  }

  return 0;
}

/**********************************************
 * void do_sang(char *fname, int ng, int threshold)
 *
 * purpose
 *   Walk the suffix array of fname in order and report, through
 *   hyouji(), how often each distinct n-gram occurs.  Consecutive
 *   array entries that share their first ng bytes are counted as
 *   one n-gram.  N-grams containing a newline are never reported.
 *
 * parameters
 *   fname : name passed to sa_openfiles()
 *   ng : n of the n-gram; must be between 1 and sizeof nstr - 1
 *   threshold : passed on to hyouji() to filter the output
 *
 * return value
 *   none; the program exits with status 1 when ng is out of range
 *   or the files cannot be opened
 **********************************************/
void do_sang(char *fname, int ng, int threshold)
{
  SUFARY *ary;
  long idx;
  int n_ctr = 0; /* occurrences of the current n-gram, minus one */
  char *p;

  /* nstr must be able to hold ng bytes plus the terminating NUL */
  if (ng <= 0 || (size_t)ng >= sizeof nstr) {
    fprintf(stderr, "invalid n-gram length: %d (must be 1..%d)\n",
            ng, (int)sizeof nstr - 1);
    usage();
    exit(1);
  }

  /****** open the text / array files ******/
  if ((ary = sa_openfiles(fname,NULL)) == NULL){
    printf("argument ignored.\n");
    usage();
    exit(1);
  }

  nstr[0] = '\0';

  /* visit every entry of the array */
  for (idx = sa_bottom(ary); idx <= sa_top(ary); idx++){
    /* text position this array entry refers to */
    p = sa_aryidx2txtptr(ary, idx);
    if (strncmp(p, nstr, ng) == 0){
      /* same n-gram as before: count it */
      n_ctr++;
    } else {
      /* a new n-gram starts: report the previous one first */
      if (*nstr != '\0' && !strstr(nstr, "\n"))
        hyouji(nstr, n_ctr, threshold);
      strncpy(nstr, p, ng);
      /* strncpy does not terminate when p has ng or more bytes; without
         this, bytes left over from an earlier, longer n-gram would show */
      nstr[ng] = '\0';
      n_ctr = 0;
    }
  }

  /* report the last n-gram, under the same filter as inside the loop */
  if (*nstr != '\0' && !strstr(nstr, "\n"))
    hyouji(nstr, n_ctr, threshold);

  sa_closefiles(ary);
}


/**********************************************
 * void hyouji(char *nstr, int n_ctr, int threshold)
 *
 * purpose
 *   Print one n-gram together with its frequency.
 *
 * parameters
 *   nstr : the n-gram string; rewritten in place by mojibakebousi()
 *          before it is printed
 *   n_ctr : counter for the n-gram; the printed frequency is n_ctr + 1
 *   threshold : nothing is printed when n_ctr is below this value
 *
 * return value
 *   none
 **********************************************/
void hyouji(char *nstr, int n_ctr, int threshold)
{
  /* below the threshold: stay silent */
  if (n_ctr < threshold)
    return;

  /* clean the string up for display, then print "<frequency> <n-gram>" */
  mojibakebousi(nstr, 0);
  printf("%d %s\n", n_ctr + 1, nstr);
}


/**************
   Print the version line and the command line help to stderr.
   **************/
static void usage(void)
{
  /* Version 0.2 is dated 980327 in the file header (0.1 is 971029). */
  fprintf(stderr,
          "Version 0.2  980327  YAMASITA Tatuo (tatuo-y@cl.aist-nara.ac.jp)\n"
          "USAGE   sang -n NUM -t NUM FILENAME\n"
          "OPTION\n"
          "  -n NUM : N for N-gram\n"
          "  -t NUM : threshold\n");
}


/* from show.c(for 'array') */
/**********************************************
 * int mojibakebousi(char *buf, int haba);
 *
 * purpose
 *    Clean buf up for display, in place: bytes below 0x20 become
 *    '!', and the first and/or last byte is overwritten with '%'
 *    depending on the parity of the single-byte character counts.
 *    NOTE(review): the parity rule presumably detects a two-byte
 *    character (e.g. EUC-JP) cut in half at either end -- confirm.
 * parameters
 *    buf:  NUL-terminated string, modified in place
 *    haba: index splitting buf into a leading part [0, haba) and
 *          a trailing part [haba, end)
 * return value
 *    number of bytes below 0x80 found in the leading part
 **********************************************/
int mojibakebousi(char *buf,int haba){
  size_t len = strlen(buf); /* replacements never change the length */
  size_t lead = haba > 0 ? (size_t)haba : 0; /* negative haba: no leading part */
  int pre_hankaku = 0;  /* single-byte characters before haba */
  int post_hankaku = 0; /* single-byte characters from haba on */
  size_t i;

  for (i = 0; i < len; i++) {
    unsigned char c = (unsigned char)buf[i];

    if (c >= 0x80)
      continue; /* only single-byte characters are counted */
    if (i < lead)
      pre_hankaku++;
    else
      post_hankaku++;
    if (c < 0x20)
      buf[i] = '!'; /* control character */
  }

  /* odd count in the leading part: mask the first byte */
  if (pre_hankaku % 2 == 1)
    buf[0] = '%';
  /* parity of the trailing count differs from that of the length:
     mask the last byte (len is at least 1 whenever this holds) */
  if ((size_t)(post_hankaku % 2) != len % 2)
    buf[len - 1] = '%';

  return pre_hankaku;
}