reenable stripping newlines

This commit is contained in:
dockes 2006-01-25 08:39:07 +00:00
parent 7ad5f2d90d
commit 65d00b9c74
2 changed files with 55 additions and 40 deletions

View File

@ -32,30 +32,34 @@
// I can see no good reason to do this, and it actually helps preview to keep // I can see no good reason to do this, and it actually helps preview to keep
// whitespace, especially if the html comes from a filter that generated it // whitespace, especially if the html comes from a filter that generated it
// from text (i.e. inside <pre> tags) // from text (i.e. inside <pre> tags)
//
// On the other hand, doing it takes us closer to what the HTML rendering
// would be. We should actually switch it on/off according to <pre> tags.
void void
MyHtmlParser::process_text(const string &text) MyHtmlParser::process_text(const string &text)
{ {
if (!in_script_tag && !in_style_tag) { if (!in_script_tag && !in_style_tag) {
#if 0 if (!in_pre_tag) {
string::size_type b = 0; string::size_type b = 0;
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) { while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
if (pending_space || b != 0) if (pending_space || b != 0)
if (!dump.empty()) dump += ' '; if (!dump.empty())
pending_space = true; dump += ' ';
string::size_type e = text.find_first_of(WHITESPACE, b); pending_space = true;
if (e == string::npos) { string::size_type e = text.find_first_of(WHITESPACE, b);
dump += text.substr(b); if (e == string::npos) {
pending_space = false; dump += text.substr(b);
break; pending_space = false;
break;
}
dump += text.substr(b, e - b);
b = e + 1;
} }
dump += text.substr(b, e - b); } else {
b = e + 1; if (pending_space)
dump += ' ';
dump += text;
} }
#else
if (pending_space)
dump += ' ';
dump += text;
#endif
} }
} }
@ -192,7 +196,11 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
if (tag == "ol" || tag == "option") pending_space = true; if (tag == "ol" || tag == "option") pending_space = true;
break; break;
case 'p': case 'p':
if (tag == "p" || tag == "pre" || tag == "plaintext") { if (tag == "p" || tag == "plaintext") {
dump += '\n';
pending_space = true;
} else if (tag == "pre") {
in_pre_tag = true;
dump += '\n'; dump += '\n';
pending_space = true; pending_space = true;
} }
@ -269,7 +277,12 @@ MyHtmlParser::closing_tag(const string &tag)
if (tag == "ol" || tag == "option") pending_space = true; if (tag == "ol" || tag == "option") pending_space = true;
break; break;
case 'p': case 'p':
if (tag == "p" || tag == "pre") pending_space = true; if (tag == "p") {
pending_space = true;
} else if (tag == "pre") {
pending_space = true;
in_pre_tag = false;
}
break; break;
case 'q': case 'q':
if (tag == "q") pending_space = true; if (tag == "q") pending_space = true;

View File

@ -29,24 +29,26 @@
#define WHITESPACE " \t\n\r" #define WHITESPACE " \t\n\r"
class MyHtmlParser : public HtmlParser { class MyHtmlParser : public HtmlParser {
public: public:
bool in_script_tag; bool in_script_tag;
bool in_style_tag; bool in_style_tag;
bool in_body_tag; bool in_body_tag;
bool pending_space; bool in_pre_tag;
string title, sample, keywords, dump, dmtime; bool pending_space;
string ocharset; // This is the charset our user thinks the doc was string title, sample, keywords, dump, dmtime;
string charset; // This is the charset it was supposedly converted to string ocharset; // This is the charset our user thinks the doc was
string doccharset; // Set this to value of charset parameter in header string charset; // This is the charset it was supposedly converted to
bool indexing_allowed; string doccharset; // Set this to value of charset parameter in header
void process_text(const string &text); bool indexing_allowed;
void opening_tag(const string &tag, const map<string,string> &p); void process_text(const string &text);
void closing_tag(const string &tag); void opening_tag(const string &tag, const map<string,string> &p);
void do_eof(); void closing_tag(const string &tag);
MyHtmlParser() : void do_eof();
in_script_tag(false), MyHtmlParser() :
in_style_tag(false), in_script_tag(false),
in_body_tag(false), in_style_tag(false),
pending_space(false), in_body_tag(false),
indexing_allowed(true) { } in_pre_tag(false),
pending_space(false),
indexing_allowed(true) { }
}; };