/* Boost.Flyweight example of flyweight-based formatted text processing.
*
* Copyright 2006-2023 Joaquin M Lopez Munoz.
* Distributed under the Boost Software License, Version 1.0.
* (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*
* See http://www.boost.org/libs/flyweight for library home page.
*/
#include <boost/flyweight.hpp>
#include <boost/functional/hash.hpp>
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
#if defined(BOOST_NO_STDC_NAMESPACE)
namespace std{using ::exit;using ::tolower;}
#endif
using namespace boost::flyweights;
/* An HTML tag consists of a name and optional properties of the form
* name1=value1 ... namen=valuen. We do not need to parse the properties
* for the purposes of the program, hence they are all stored in
* html_tag_data::properties in raw form.
*/
struct html_tag_data
{
/* Tag name; parse_tag stores it lower-cased and without the leading '/'
 * of closing tags.
 */
std::string name;
/* Raw, unparsed text following the name inside the tag; left empty when
 * parse_tag finds nothing after the name.
 */
std::string properties;
};
/* Two tags compare equal when both their names and their raw property
 * strings match exactly.
 */
bool operator==(const html_tag_data& x,const html_tag_data& y)
{
  if(x.name!=y.name)return false;
  return x.properties==y.properties;
}
/* See the portability section of Boost.Hash at
* http://boost.org/doc/html/hash/portability.html
* for an explanation of the ADL-related workarounds.
*/
#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
namespace boost{
#endif
/* Hash of an html_tag_data: name and properties are folded, in that order,
 * into a single value with boost::hash_combine, i.e. over the same two
 * members operator== compares.
 */
std::size_t hash_value(const html_tag_data& x)
{
  std::size_t seed=0;
  boost::hash_combine(seed,x.name);
  boost::hash_combine(seed,x.properties);
  return seed;
}
#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
} /* namespace boost */
#endif
/* html_tag is the flyweight wrapper around html_tag_data used by the rest
 * of the program to store and compare tags.
 */
typedef flyweight<html_tag_data> html_tag;
/* parse_tag is passed an iterator positioned at the first char of
* the tag after the opening '<' and returns, if successful, a parsed tag
* and whether it is opening (<xx>) or closing (</xx>).
*/
/* Outcome of parse_tag: an opening tag, a closing tag, or a parse failure. */
enum tag_type{opening,closing,failure};
/* Result of parse_tag: the kind of tag found plus the parsed tag itself. */
struct parse_tag_res
{
/* Builds a result from a type and the tag data; the data defaults to an
 * empty html_tag_data, which is what failure results are built with.
 */
parse_tag_res(tag_type type_,const html_tag_data& tag_=html_tag_data()):
type(type_),tag(tag_){}
/* Memberwise copy. */
parse_tag_res(const parse_tag_res& x):type(x.type),tag(x.tag){}
tag_type type; /* opening, closing or failure */
html_tag tag; /* flyweight built from the parsed html_tag_data */
};
/* Parses the tag whose text starts at first (just past the opening '<').
 *
 * Characters are accumulated up to the first '>' found outside double
 * quotes. The first whitespace-delimited token is the tag name (a leading
 * '/' marks a closing tag and is stripped from the name); everything from
 * the first non-whitespace char after the name is kept verbatim as the
 * properties.
 *
 * On success first is advanced past the terminating '>' and the result
 * carries the tag type and the tag, with its name lower-cased. On failure
 * (no unquoted '>' before last, or only whitespace before the '>') first
 * is left untouched and the result type is failure.
 */
template<typename ForwardIterator>
parse_tag_res parse_tag(ForwardIterator& first,ForwardIterator last)
{
  html_tag_data tag;
  std::string   buf;
  bool          in_quote=false;

  for(ForwardIterator it=first;it!=last;){
    char ch=*it++;
    if(ch=='>'&&!in_quote){            /* ignore '>'s if inside quotes */
      tag_type type;
      std::string::size_type
        bname=buf.find_first_not_of("\t\n\r "),
        ename=bname==std::string::npos?
          std::string::npos:
          buf.find_first_of("\t\n\r ",bname),
        bprop=ename==std::string::npos?
          std::string::npos:
          buf.find_first_not_of("\t\n\r ",ename);
      if(bname==ename){                /* null name */
        return parse_tag_res(failure);
      }
      else if(buf[bname]=='/'){        /* closing tag */
        type=closing;
        ++bname;
      }
      else type=opening;
      tag.name=buf.substr(bname,ename-bname);

      /* Normalize the tag name to lower case. std::tolower requires an
       * argument representable as unsigned char (or EOF); feeding it a
       * negative plain char is undefined behavior, so every char goes
       * through unsigned char first.
       */
      for(std::string::iterator c=tag.name.begin();c!=tag.name.end();++c){
        *c=static_cast<char>(std::tolower(static_cast<unsigned char>(*c)));
      }

      if(bprop!=std::string::npos){
        tag.properties=buf.substr(bprop,buf.size());
      }
      first=it;                        /* result good, consume the chars */
      return parse_tag_res(type,tag);
    }
    else{
      if(ch=='"')in_quote=!in_quote;
      buf+=ch;
    }
  }
  return parse_tag_res(failure);       /* end reached and found no '>' */
}
/* A character context is just a vector containing the tags enclosing the
* character, from the outermost level to the innermost.
*/
typedef std::vector<html_tag> html_context_data;
/* html_context is the flyweight wrapper around html_context_data; it is the
 * type stored in character_data::context.
 */
typedef flyweight<html_context_data> html_context;
/* A character is a char code plus its context.
*/
struct character_data
{
/* Builds a character from its code and context; the defaults are code 0
 * and a default-constructed html_context.
 */
character_data(char code_=0,html_context context_=html_context()):
code(code_),context(context_){}
/* Memberwise copy. */
character_data(const character_data& x):code(x.code),context(x.context){}
char code; /* the character itself */
html_context context; /* tags enclosing the character */
};
/* Two characters compare equal when they have the same code and the same
 * context.
 */
bool operator==(const character_data& x,const character_data& y)
{
  if(x.code!=y.code)return false;
  return x.context==y.context;
}
#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
namespace boost{
#endif
/* Hash of a character_data: code and context are folded, in that order,
 * into a single value with boost::hash_combine, i.e. over the same two
 * members operator== compares.
 */
std::size_t hash_value(const character_data& x)
{
  std::size_t seed=0;
  boost::hash_combine(seed,x.code);
  boost::hash_combine(seed,x.context);
  return seed;
}
#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
} /* namespace boost */
#endif
/* character is the flyweight wrapper around character_data; scan_html
 * emits values of this type and produce_html consumes them.
 */
typedef flyweight<character_data> character;
/* scan_html converts HTML code into a stream of contextualized characters.
*/
/* Converts the HTML code in [first,last) into contextualized characters
 * written to out.
 *
 * Every '<' is handed to parse_tag. Opening tags are pushed onto the current
 * context and closing tags pop it; neither produces output. Any other
 * character is emitted as a character carrying a copy of the current
 * context. When parse_tag fails, the '<' itself is dropped and the text
 * after it is scanned as ordinary characters.
 */
template<typename ForwardIterator,typename OutputIterator>
void scan_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
{
  html_context_data context;
  while(first!=last){
    if(*first=='<'){                                 /* tag found */
      ++first;
      parse_tag_res res=parse_tag(first,last);
      if(res.type==opening){                         /* add to context */
        context.push_back(res.tag);
        continue;
      }
      else if(res.type==closing){                    /* remove from context */
        /* Pop all tags from the innermost to the matching one; this takes
         * care of missing </xx>s like e.g. in <ul><li>hello</ul>. A closing
         * tag matching nothing in the context is ignored.
         */
        for(html_context_data::reverse_iterator rit=context.rbegin();
            rit!=context.rend();++rit){
          if(rit->get().name==res.tag->name){
            context.erase(rit.base()-1,context.end());
            break;
          }
        }
        continue;
      }
      /* Parsing failed and first still points right after the '<'. If that
       * '<' was the last character of the input there is nothing left to
       * emit: falling through would dereference last.
       */
      if(first==last)break;
    }
    *out++=character(*first++,html_context(context));
  }
}
/* HTML-producing utilities */
/* Writes the opening form of tag x to os: "<name>" when the tag has no
 * properties, "<name properties>" otherwise.
 */
void print_opening_tag(std::ostream& os,const html_tag_data& x)
{
  os<<"<"<<x.name;
  if(x.properties.empty()){
    os<<">";
    return;
  }
  os<<" "<<x.properties;
  os<<">";
}
/* Writes "</name>" for tag x to os, unless the name begins with '!'. */
void print_closing_tag(std::ostream& os,const html_tag_data& x)
{
  /* SGML declarations (beginning with '!') are not closed */
  const bool is_declaration=(x.name[0]=='!');
  if(is_declaration)return;

  os<<"</"<<x.name<<">";
}
/* change_context takes contexts from and to with tags
*
* from<- c1 ... cn fn+1 ... fm
* to <- c1 ... cn tn+1 ... tk
*
* (that is, they share the first n tags, n might be 0), and
* produces code closing fm ... fn+1 and opening tn+1 ... tk.
*/
template<typename OutputIterator>
void change_context(
const html_context_data& from,const html_context_data& to,
OutputIterator out)
{
std::ostringstream oss;
html_context_data::const_iterator
it0=from.begin(),
it0_end=from.end(),
it1=to.begin(),
it1_end=to.end();
for(;it0!=it0_end&&it1!=it1_end&&*it0==*it1;++it0,++it1);
while(it0_end!=it0)print_closing_tag(oss,*--it0_end);
while(it1!=it1_end)print_opening_tag(oss,*it1++);
std::string str=oss.str();
std::copy(str.begin(),str.end(),out);
}
/* produce_html is passed a bunch of contextualized characters and emits
* the corresponding HTML. The algorithm is simple: tags are opened and closed
* whenever the context changes from one character to the next.
*/
template<typename ForwardIterator,typename OutputIterator>
void produce_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
{
html_context context;
while(first!=last){
if(first->get().context!=context){
change_context(context,first->get().context,out);
context=first->get().context;
}
*out++=(first++)->get().code;
}
change_context(context,html_context(),out); /* close remaining context */
}
/* Without these explicit instantiations, MSVC++ 6.5/7.0 does not
* find some friend operators in certain contexts.
*/
character dummy1; /* unused object; instantiates flyweight<character_data> */
html_tag dummy2; /* unused object; instantiates flyweight<html_tag_data> */
/* Interactive driver: asks for the name of an input HTML file, scans it
 * into contextualized characters, reverses the central portion of them and
 * writes the regenerated HTML to an output file whose name is asked for too.
 *
 * Returns EXIT_FAILURE if either file cannot be opened and 0 otherwise.
 * Failures return from main instead of calling std::exit so that the local
 * streams and containers are destroyed on the way out.
 */
int main()
{
  std::cout<<"input html file: ";
  std::string in;
  std::getline(std::cin,in);
  std::ifstream ifs(in.c_str());
  if(!ifs){
    std::cout<<"can't open "<<in<<std::endl;
    return EXIT_FAILURE;
  }
  typedef std::istreambuf_iterator<char> istrbuf_iterator;
  std::vector<char> html_source;
  std::copy(
    istrbuf_iterator(ifs),istrbuf_iterator(),
    std::back_inserter(html_source));

  /* parse the HTML */

  std::vector<character> scanned_html;
  scan_html(
    html_source.begin(),html_source.end(),std::back_inserter(scanned_html));

  /* Now that we have the text as a vector of contextualized characters,
   * we can shuffle it around and manipulate in almost any way we please.
   * For instance, the following reverses the central portion of the doc.
   */

  std::reverse(
    scanned_html.begin()+scanned_html.size()/4,
    scanned_html.begin()+3*(scanned_html.size()/4));

  /* emit the resulting HTML */

  std::cout<<"output html file: ";
  std::string out;
  std::getline(std::cin,out);
  std::ofstream ofs(out.c_str());
  if(!ofs){
    std::cout<<"can't open "<<out<<std::endl;
    return EXIT_FAILURE;
  }
  typedef std::ostreambuf_iterator<char> ostrbuf_iterator;
  produce_html(scanned_html.begin(),scanned_html.end(),ostrbuf_iterator(ofs));

  return 0;
}