File: html2dic.c

package info (click to toggle)
dirb 2.22+dfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 2,200 kB
  • sloc: ansic: 1,672; sh: 1,119; makefile: 51; perl: 23
file content (106 lines) | stat: -rw-r--r-- 2,481 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/*
 * DIRB
 *
 * html2dic.c - Genera un diccionario a partir de una pagina HTML
 * Ultima modificacion: 31/03/2005
 *
 * Idea de Warezzman, coded por Darkraver
 *
 */


// (!) TODO: add support for HTML encoded in Unicode

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Size of the scratch buffer that holds one pending entity reference
 * ("&name;"), including the terminating NUL. */
enum { ENTITY_BUF_SIZE = 64 };

/* Maps a named HTML entity to the single byte written in its place. */
struct entity_map {
  const char *name;
  unsigned char byte;
};

/*
 * Entities that are decoded to a letter.
 *
 * The byte values are the ISO-8859-1 codes of the corresponding letters.
 * NOTE(review): the character literals originally passed to putchar() had
 * been lost to an encoding error (they were empty character constants, which
 * do not compile); Latin-1 is presumed to be the intended output encoding --
 * confirm against downstream consumers of the word list.
 */
static const struct entity_map entity_table[] = {
  { "&ntilde;", 0xF1 },
  { "&aacute;", 0xE1 },
  { "&eacute;", 0xE9 },
  { "&iacute;", 0xED },
  { "&oacute;", 0xF3 },
  { "&uacute;", 0xFA },
  { "&Aacute;", 0xC1 },
  { "&Eacute;", 0xC9 },
  { "&Iacute;", 0xCD },
  { "&Oacute;", 0xD3 },
  { "&Uacute;", 0xDA },
};

/* Returns non-zero if c may appear inside a dictionary word. */
static int is_word_char(int c)
{
  static const char word[] =
    "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_";

  /* strchr() would match the terminator for c == 0, so rule it out first. */
  return c != '\0' && strchr(word, c) != NULL;
}

/* Returns non-zero if c may appear between the '&' and ';' of an entity. */
static int is_entity_char(int c)
{
  static const char allowed[] =
    "#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

  return c != '\0' && strchr(allowed, c) != NULL;
}

/* Writes one byte that belongs to a word, starting a new output line first
 * when no word is currently open. */
static void emit_word_byte(int c, int *in_word)
{
  if (!*in_word)
    putchar('\n');
  *in_word = 1;
  putchar(c);
}

/* Handles one byte of plain text: word characters are printed, anything else
 * closes the current word.  NUL bytes are ignored entirely. */
static void emit_text(int c, int *in_word)
{
  if (c == '\0')
    return;

  if (is_word_char(c))
    emit_word_byte(c, in_word);
  else
    *in_word = 0;
}

/* Acts on a complete, NUL-terminated entity reference such as "&ntilde;".
 * Known letters are printed as part of the current word, "&nbsp;" closes the
 * current word, and any other entity is dropped without affecting the word
 * state. */
static void finish_entity(const char *entity, int *in_word)
{
  size_t i;

  if (strcmp(entity, "&nbsp;") == 0) {
    *in_word = 0;
    return;
  }

  for (i = 0; i < sizeof entity_table / sizeof entity_table[0]; i++) {
    if (strcmp(entity, entity_table[i].name) == 0) {
      /* A decoded letter is a word character: it must open a new word when
       * none is in progress instead of being glued to the previous line. */
      emit_word_byte(entity_table[i].byte, in_word);
      return;
    }
  }
}

/* Gives up on a pending entity that turned out not to be one (e.g. the '&'
 * in "AT&T"): the '&' acts as a word separator and the characters collected
 * after it are replayed as ordinary text so that nothing is lost. */
static void abandon_entity(const char *entity, size_t len, int *in_word)
{
  size_t i;

  *in_word = 0;
  for (i = 1; i < len; i++)
    emit_text((unsigned char)entity[i], in_word);
}

/*
 * html2dic - builds a word list from an HTML file.
 *
 * Usage: ./html2dic <file>
 *
 * Reads the file byte by byte, skips everything between '<' and '>', decodes
 * a small set of named entities, and prints every run of word characters to
 * stdout, each preceded by a newline.
 *
 * Exits with status 0 on success and -1 on a usage error, if the file cannot
 * be opened, or if reading it fails.
 */
int main(int argc, char **argv) {
  char entity[ENTITY_BUF_SIZE];
  size_t entity_len = 0;   /* 0 means no entity reference is pending */
  int in_tag = 0;
  int in_word = 0;
  int status = 0;
  int c;
  FILE *fd;

  if (argc != 2) {
    printf("Usage: ./html2dic <file>\n");
    exit(-1);
  }

  /* Open the input file. */
  fd = fopen(argv[1], "r");
  if (fd == NULL) {
    perror("fopen");
    exit(-1);
  }

  /* Main read loop. */
  while ((c = fgetc(fd)) != EOF) {

    /* Inside a tag everything is ignored up to the closing '>'.  In
     * particular an '&' in an attribute value must not start an entity. */
    if (in_tag) {
      if (c == '>')
        in_tag = 0;
      continue;
    }

    /* An entity reference is being collected. */
    if (entity_len > 0) {
      if (c == ';') {
        entity[entity_len++] = ';';
        entity[entity_len] = '\0';
        finish_entity(entity, &in_word);
        entity_len = 0;
        continue;
      }

      /* Leave room for the closing ';' and the NUL terminator. */
      if (is_entity_char(c) && entity_len < ENTITY_BUF_SIZE - 2) {
        entity[entity_len++] = (char)c;
        continue;
      }

      /* Not a valid reference after all; c is still handled below. */
      abandon_entity(entity, entity_len, &in_word);
      entity_len = 0;
    }

    /* Plain text. */
    if (c == '<') {
      in_tag = 1;
      in_word = 0;
    } else if (c == '&') {
      entity[0] = '&';
      entity_len = 1;
    } else {
      emit_text(c, &in_word);
    }
  }

  /* Input ended in the middle of an entity: keep the collected text. */
  if (entity_len > 0)
    abandon_entity(entity, entity_len, &in_word);

  if (ferror(fd)) {
    perror("fgetc");
    status = -1;
  }

  fclose(fd);

  exit(status);

}