File: HTMLParse.c

package info (click to toggle)
r-cran-xml 3.98-1.5-1
  • links: PTS
  • area: main
  • in suites: stretch
  • size: 9,464 kB
  • ctags: 636
  • sloc: xml: 79,579; ansic: 6,518; asm: 644; sh: 16; makefile: 1
file content (161 lines) | stat: -rw-r--r-- 4,092 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/*
  This file uses the HTML parser in libxml to provide an HTML
  parser in R that is basically identical to the XML parsing interface.
  It can handle files, URLs, compressed files, and raw HTML text.
  It drops the DTD and validation options since these are not very relevant
  for HTML. (We can put them back if anyone wants!)
 */

#include "DocParse.h"
#include "Utils.h"

#include "libxml/HTMLparser.h"
#include "libxml/HTMLtree.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>  

/*
  Parse an HTML document with libxml's HTML parser and convert the result
  with RS_XML(convertXMLDoc), tagging the returned object with the class
  "HTMLDocument".

  fileName            character vector; element 0 is the file name/URL, or the
                      HTML text itself when asText is TRUE.
  converterFunctions  handed to RS_XML(convertXMLDoc) and stored in the
                      parser settings.
  skipBlankLines      logical; element 0 is stored in the parser settings.
  replaceEntities     currently unused: the only code consulting it is
                      compiled out with #if 0.
  asText              logical; TRUE means fileName holds the HTML content.
  trim                logical; element 0 is stored in the parser settings.
  isURL               logical; TRUE skips the stat() existence check.

  An error is signalled through PROBLEM/ERROR when the file cannot be found,
  when the text copy cannot be allocated, or when libxml returns no document.
 */
USER_OBJECT_
RS_XML(HtmlParseTree)(USER_OBJECT_ fileName, USER_OBJECT_ converterFunctions, 
                       USER_OBJECT_ skipBlankLines, USER_OBJECT_ replaceEntities,
                       USER_OBJECT_ asText, USER_OBJECT_ trim, USER_OBJECT_ isURL)
{
  /* Size of the scratch buffer used to report the source in error messages. */
  enum { NAME_MSG_SIZE = 4096 };

  const char *name;
  xmlDocPtr doc;
  USER_OBJECT_ rdoc;
  USER_OBJECT_ className;
  R_XMLSettings parserSettings;
  int freeName = 0;

  int asTextBuffer = LOGICAL_DATA(asText)[0];
  int isURLDoc = LOGICAL_DATA(isURL)[0];

  /* Only three settings are assigned below; clear the whole object first so
     that any other members start from zero rather than indeterminate values.
     NOTE(review): R_XMLSettings is declared outside this file - confirm that
     all-zero is the intended default for its remaining members. */
  memset(&parserSettings, 0, sizeof(parserSettings));
  parserSettings.skipBlankLines = LOGICAL_DATA(skipBlankLines)[0];
  parserSettings.converters = converterFunctions;
  parserSettings.trim = LOGICAL_DATA(trim)[0];

  if(asTextBuffer == 0) {
    struct stat tmp_stat;  
#ifdef USE_R
    name = CHAR(STRING_ELT(fileName, 0));
#else
    name = CHARACTER_DATA(fileName)[0];
#endif
    /* Local files must exist; URLs are left for libxml to resolve. */
    if(!isURLDoc && (name == NULL || stat(name, &tmp_stat) < 0)) {
      PROBLEM "Can't find file %s", CHAR_DEREF(STRING_ELT(fileName, 0))
      ERROR;
    }
  } else {
     /* Work on a private heap copy of the text; it is released below. */
     name = strdup(CHAR_DEREF(STRING_ELT(fileName, 0)));
     if(name == NULL) {
       PROBLEM "cannot allocate memory for a copy of the HTML text"
       ERROR;
     }
     freeName = 1;
  }


#if 0
    /* If one wants entities expanded directly and to appear as text.  */
  if(LOGICAL_DATA(replaceEntities)[0])
    xmlSubstituteEntitiesDefault(1);   
#endif

  if(asTextBuffer) {
   doc = htmlParseDoc(CHAR_TO_XMLCHAR(name), NULL);
   if(doc != NULL) {
      doc->name = (char *) xmlStrdup(CHAR_TO_XMLCHAR("<buffer>"));
   }
  } else {
      doc = htmlParseFile(name, NULL);
  }

  if(doc == NULL) {
    /* The heap copy has to be released before the error is raised, but the
       message still needs its contents: snapshot them into a stack buffer
       first so the freed memory is never read. */
    char shown[NAME_MSG_SIZE];
    snprintf(shown, sizeof(shown), "%s", name);
    if(freeName)
        free((char *) name);
    PROBLEM "error in creating parser for %s", shown
    ERROR;
  }

  PROTECT(rdoc = RS_XML(convertXMLDoc)(name, doc, converterFunctions, &parserSettings));

  if(freeName)
      free((char *) name);


  /* NOTE(review): the libxml document is not released here because the
     call below is compiled out - confirm whether that is intentional. */
#if 0
  xmlFreeDoc(doc);
  R_numXMLDocsFreed++;
#endif

     /* Set the class for the document. */
  className = NEW_CHARACTER(1);
  PROTECT(className);
    SET_STRING_ELT(className, 0, mkChar("HTMLDocument"));   
    SET_CLASS(rdoc, className);
  UNPROTECT(1);


 UNPROTECT(1); 
 return(rdoc);
}




/*
  Copied from  RS_XML_printXMLNode (XMLTree.c)  with minor changes.

  Serialize an HTML document held in an external pointer to a string.

  r_node      external pointer whose address is the xmlDocPtr to dump.
  format      integer; element 0 is passed to htmlDocContentDumpFormatOutput
              as its format argument.
  r_encoding  character vector; if non-empty, element 0 is passed as the
              encoding argument of the dump.
  indent      logical; element 0 is installed in xmlIndentTreeOutput for the
              duration of the call and the previous value is restored.
  outFile     only consulted when ADD_XML_OUTPUT_BUFFER_CODE is enabled, in
              which case a non-empty value makes the document go to that file
              and R_NilValue is returned.

  Returns a character vector of length 1 holding the serialized document, or
  a freshly allocated character vector of length 1 when nothing was written.
  An error is signalled through PROBLEM/ERROR for a NULL document or when the
  libxml buffers cannot be created.
 */
USER_OBJECT_
RS_XML_dumpHTMLDoc(USER_OBJECT_ r_node, USER_OBJECT_ format, USER_OBJECT_ r_encoding, USER_OBJECT_ indent, USER_OBJECT_ outFile)
{
    USER_OBJECT_ ans;
    xmlDocPtr node;
    const char *encoding = NULL;
    xmlOutputBufferPtr buf;
    xmlBufferPtr xbuf;

    int oldIndent;

    node = (xmlDocPtr) R_ExternalPtrAddr(r_node);
    if(node == NULL) {
        /* Checked before any global state is touched. */
        PROBLEM "NULL HTML document passed to RS_XML_dumpHTMLDoc"
        ERROR;
    }

    oldIndent = xmlIndentTreeOutput;
    xmlIndentTreeOutput =  LOGICAL(indent)[0];

#if ADD_XML_OUTPUT_BUFFER_CODE
    if(Rf_length(outFile)) {
       htmlSaveFile(CHAR_DEREF(STRING_ELT(outFile, 0)), node);
       /* Undo the global indent change on this exit path as well. */
       xmlIndentTreeOutput = oldIndent;
       return(R_NilValue);
    }
#endif

   
    if(GET_LENGTH(r_encoding))
	encoding = CHAR_DEREF(STRING_ELT(r_encoding, 0));

    xbuf = xmlBufferCreate();
    if(xbuf == NULL) {
        xmlIndentTreeOutput = oldIndent;
        PROBLEM "failed to create a buffer for the HTML output"
        ERROR;
    }

#if 1
    buf = xmlOutputBufferCreateBuffer(xbuf, NULL);
#else
    buf = xmlOutputBufferCreateFilename("/tmp/test.out", NULL, 0);
#endif
    if(buf == NULL) {
        xmlBufferFree(xbuf);
        xmlIndentTreeOutput = oldIndent;
        PROBLEM "failed to create an output buffer for the HTML output"
        ERROR;
    }

    htmlDocContentDumpFormatOutput(buf, node, encoding, INTEGER(format)[0]);
    xmlOutputBufferFlush(buf);
    xmlIndentTreeOutput = oldIndent;

    if(xmlBufferLength(xbuf) > 0) {
        /*XXX this const char * in CHARSXP means we have to make multiple copies. */
#if 0
	char *rbuf = R_alloc(sizeof(char) * (xbuf->use + 1));
	memcpy(rbuf, xbuf->content, xbuf->use + 1);
	PROTECT(tmp = mkChar(rbuf));
#endif
	// ans = ScalarString(mkChar(xbuf->content));
	/* Kept inside this nested block, as in the original layout, so that
	   whatever the macro declares stays local to it. */
	DECL_ENCODING_FROM_DOC(node)
	ans = ScalarString(ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(xmlBufferContent(xbuf))));
    } else
      ans = NEW_CHARACTER(1);

    xmlOutputBufferClose(buf);
    /* Closing the output buffer does not release the xmlBuffer it wrote
       into; free it explicitly so it is not leaked on every call. */
    xmlBufferFree(xbuf);

    return(ans);
}