/*
 * DIRB
 *
 * html2dic.c - Generates a dictionary (word list) from an HTML page
 * Last modified: 31/03/2005
 *
 * Idea by Warezzman, coded by Darkraver
 *
 */
// (!) TODO: add support for HTML in Unicode
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Size of the buffer holding the entity being read ('&' through ';'). */
enum { ENTITY_BUF_SIZE = 1024 };

/* One translatable named entity and the single byte written for it. */
struct entity_map {
    const char *name;      /* full entity text, including '&' and ';' */
    unsigned char latin1;  /* ISO-8859-1 code of the corresponding letter */
};

/*
 * Entities that are translated into a letter. Output is ISO-8859-1 (one
 * byte per letter); the bytes are spelled as numeric codes so this source
 * file stays pure ASCII and survives re-encoding.
 */
static const struct entity_map entity_table[] = {
    { "&ntilde;", 0xF1 },
    { "&aacute;", 0xE1 },
    { "&eacute;", 0xE9 },
    { "&iacute;", 0xED },
    { "&oacute;", 0xF3 },
    { "&uacute;", 0xFA },
    { "&Aacute;", 0xC1 },
    { "&Eacute;", 0xC9 },
    { "&Iacute;", 0xCD },
    { "&Oacute;", 0xD3 },
    { "&Uacute;", 0xDA },
};

/* Returns the ISO-8859-1 byte for a known entity, or -1 if it is unknown. */
static int entity_to_latin1(const char *name)
{
    for (size_t i = 0; i < sizeof entity_table / sizeof entity_table[0]; i++) {
        if (strcmp(name, entity_table[i].name) == 0) {
            return entity_table[i].latin1;
        }
    }
    return -1;
}

/* Returns non-zero if c may appear inside an entity after the leading '&'. */
static int is_entity_char(int c)
{
    static const char allowed[] =
        "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#;";

    /* strchr() would match the terminator for c == 0, so exclude it. */
    return c != '\0' && strchr(allowed, c) != NULL;
}

/*
 * html2dic <file>
 *
 * Reads an HTML file and writes every word found in its text to stdout,
 * each word preceded by a newline. A word is a run of ASCII letters,
 * digits, '-' and '_', plus the accented letters listed in entity_table.
 * Everything between '<' and '>' is skipped.
 *
 * Entity handling:
 *   - entities in entity_table are written as one ISO-8859-1 byte and count
 *     as a word character;
 *   - "&nbsp;" ends the current word;
 *   - any other entity is dropped without ending the current word;
 *   - an '&' that is not followed by a well-formed entity is treated as a
 *     word separator instead of swallowing text up to the next ';'.
 *
 * Exits with status 0 on success, and with exit(-1) on a usage error or
 * when the file cannot be opened or read.
 */
int main(int argc, char **argv) {
    static const char word_chars[] =
        "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_";
    char entity[ENTITY_BUF_SIZE] = {0};
    size_t entity_len = 0;
    int in_tag = 0;
    int in_entity = 0;
    int in_word = 0;
    FILE *fd;
    int c;

    if (argc != 2) {
        printf("Usage: ./html2dic <file>\n");
        exit(-1);
    }

    /* Open the input file. */
    fd = fopen(argv[1], "r");
    if (fd == NULL) {
        perror("fopen");
        exit(-1);
    }

    /* Read loop: one character at a time. */
    while ((c = getc(fd)) != EOF) {
        /*
         * A character that cannot belong to an entity means the '&' was not
         * the start of one (e.g. "AT&T" or "?a=1&b=2"). Give up on it so the
         * following text is not discarded, then process c normally.
         */
        if (in_entity && !is_entity_char(c)) {
            in_entity = 0;
            in_word = 0;
            entity_len = 0;
            entity[0] = '\0';
        }

        if (c == '<') {
            in_tag = 1;
            in_word = 0;
        }
        if (c == '&') {
            in_entity = 1;
        }

        /* Plain text: outside tags and outside entities. */
        if (!in_tag && !in_entity && c != '\0') {
            if (strchr(word_chars, c) != NULL) {
                if (!in_word) {
                    putchar('\n');
                }
                in_word = 1;
                putchar(c);
            } else {
                in_word = 0;
            }
        }

        /* End of an HTML tag. */
        if (c == '>') {
            in_tag = 0;
        }

        /* Accumulate the entity text, '&' and ';' included. */
        if (in_entity && entity_len < ENTITY_BUF_SIZE - 1) {
            entity[entity_len++] = (char)c;
            entity[entity_len] = '\0';
        }

        /* End of an entity: translate it if it is a known one. */
        if (c == ';' && in_entity) {
            int byte = entity_to_latin1(entity);

            if (byte >= 0) {
                /* Tag contents are skipped, so emit only in document text. */
                if (!in_tag) {
                    /* The letter is part of a word and may also begin one. */
                    if (!in_word) {
                        putchar('\n');
                    }
                    in_word = 1;
                    putchar(byte);
                }
            } else if (strcmp(entity, "&nbsp;") == 0) {
                in_word = 0;
            }

            in_entity = 0;
            entity_len = 0;
            entity[0] = '\0';
        }
    }

    /* getc() returns EOF on errors too; tell the two cases apart. */
    if (ferror(fd)) {
        perror("read");
        fclose(fd);
        exit(-1);
    }

    fclose(fd);
    exit(0);
}
|