reenable stripping newlines
This commit is contained in:
parent
7ad5f2d90d
commit
65d00b9c74
@ -32,30 +32,34 @@
|
|||||||
// I can see no good reason to do this, and it actually helps preview to keep
|
// I can see no good reason to do this, and it actually helps preview to keep
|
||||||
// whitespace, especially if the html comes from a filter that generated it
|
// whitespace, especially if the html comes from a filter that generated it
|
||||||
// from text (i.e. inside <pre> tags)
|
// from text (i.e. inside <pre> tags)
|
||||||
|
//
|
||||||
|
// OTOH, doing it takes us closer to what the HTML rendering would
|
||||||
|
// be. We should actually switch on/off according to pre tags
|
||||||
void
|
void
|
||||||
MyHtmlParser::process_text(const string &text)
|
MyHtmlParser::process_text(const string &text)
|
||||||
{
|
{
|
||||||
if (!in_script_tag && !in_style_tag) {
|
if (!in_script_tag && !in_style_tag) {
|
||||||
#if 0
|
if (!in_pre_tag) {
|
||||||
string::size_type b = 0;
|
string::size_type b = 0;
|
||||||
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
|
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
|
||||||
if (pending_space || b != 0)
|
if (pending_space || b != 0)
|
||||||
if (!dump.empty()) dump += ' ';
|
if (!dump.empty())
|
||||||
pending_space = true;
|
dump += ' ';
|
||||||
string::size_type e = text.find_first_of(WHITESPACE, b);
|
pending_space = true;
|
||||||
if (e == string::npos) {
|
string::size_type e = text.find_first_of(WHITESPACE, b);
|
||||||
dump += text.substr(b);
|
if (e == string::npos) {
|
||||||
pending_space = false;
|
dump += text.substr(b);
|
||||||
break;
|
pending_space = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
dump += text.substr(b, e - b);
|
||||||
|
b = e + 1;
|
||||||
}
|
}
|
||||||
dump += text.substr(b, e - b);
|
} else {
|
||||||
b = e + 1;
|
if (pending_space)
|
||||||
|
dump += ' ';
|
||||||
|
dump += text;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
if (pending_space)
|
|
||||||
dump += ' ';
|
|
||||||
dump += text;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -192,7 +196,11 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
if (tag == "ol" || tag == "option") pending_space = true;
|
if (tag == "ol" || tag == "option") pending_space = true;
|
||||||
break;
|
break;
|
||||||
case 'p':
|
case 'p':
|
||||||
if (tag == "p" || tag == "pre" || tag == "plaintext") {
|
if (tag == "p" || tag == "plaintext") {
|
||||||
|
dump += '\n';
|
||||||
|
pending_space = true;
|
||||||
|
} else if (tag == "pre") {
|
||||||
|
in_pre_tag = true;
|
||||||
dump += '\n';
|
dump += '\n';
|
||||||
pending_space = true;
|
pending_space = true;
|
||||||
}
|
}
|
||||||
@ -269,7 +277,12 @@ MyHtmlParser::closing_tag(const string &tag)
|
|||||||
if (tag == "ol" || tag == "option") pending_space = true;
|
if (tag == "ol" || tag == "option") pending_space = true;
|
||||||
break;
|
break;
|
||||||
case 'p':
|
case 'p':
|
||||||
if (tag == "p" || tag == "pre") pending_space = true;
|
if (tag == "p") {
|
||||||
|
pending_space = true;
|
||||||
|
} else if (tag == "pre") {
|
||||||
|
pending_space = true;
|
||||||
|
in_pre_tag = false;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case 'q':
|
case 'q':
|
||||||
if (tag == "q") pending_space = true;
|
if (tag == "q") pending_space = true;
|
||||||
|
|||||||
@ -29,24 +29,26 @@
|
|||||||
#define WHITESPACE " \t\n\r"
|
#define WHITESPACE " \t\n\r"
|
||||||
|
|
||||||
class MyHtmlParser : public HtmlParser {
|
class MyHtmlParser : public HtmlParser {
|
||||||
public:
|
public:
|
||||||
bool in_script_tag;
|
bool in_script_tag;
|
||||||
bool in_style_tag;
|
bool in_style_tag;
|
||||||
bool in_body_tag;
|
bool in_body_tag;
|
||||||
bool pending_space;
|
bool in_pre_tag;
|
||||||
string title, sample, keywords, dump, dmtime;
|
bool pending_space;
|
||||||
string ocharset; // This is the charset our user thinks the doc was
|
string title, sample, keywords, dump, dmtime;
|
||||||
string charset; // This is the charset it was supposedly converted to
|
string ocharset; // This is the charset our user thinks the doc was
|
||||||
string doccharset; // Set this to value of charset parameter in header
|
string charset; // This is the charset it was supposedly converted to
|
||||||
bool indexing_allowed;
|
string doccharset; // Set this to value of charset parameter in header
|
||||||
void process_text(const string &text);
|
bool indexing_allowed;
|
||||||
void opening_tag(const string &tag, const map<string,string> &p);
|
void process_text(const string &text);
|
||||||
void closing_tag(const string &tag);
|
void opening_tag(const string &tag, const map<string,string> &p);
|
||||||
void do_eof();
|
void closing_tag(const string &tag);
|
||||||
MyHtmlParser() :
|
void do_eof();
|
||||||
in_script_tag(false),
|
MyHtmlParser() :
|
||||||
in_style_tag(false),
|
in_script_tag(false),
|
||||||
in_body_tag(false),
|
in_style_tag(false),
|
||||||
pending_space(false),
|
in_body_tag(false),
|
||||||
indexing_allowed(true) { }
|
in_pre_tag(false),
|
||||||
|
pending_space(false),
|
||||||
|
indexing_allowed(true) { }
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user