process text from html files without a </body> tag
This commit is contained in:
parent
ae6ce2638a
commit
0122545ece
@ -1,6 +1,6 @@
|
||||
|
||||
# Only test progs in there
|
||||
PROGS = unacpp textsplit
|
||||
PROGS = internfile unacpp textsplit
|
||||
|
||||
all: $(BIGLIB) $(PROGS)
|
||||
|
||||
@ -21,6 +21,14 @@ trtextsplit.o : textsplit.cpp
|
||||
$(CXX) $(CXXFLAGS) -DTEST_TEXTSPLIT -c -o trtextsplit.o \
|
||||
textsplit.cpp
|
||||
|
||||
# Stand-alone test driver for internfile.cpp: the source is compiled with
# -DTEST_INTERNFILE into trinternfile.o, then linked with $(BIGLIB) and
# $(MIMELIB) to produce the `internfile' program listed in PROGS.
INTERNFILE_OBJS= trinternfile.o $(BIGLIB) $(MIMELIB)
|
||||
internfile : $(INTERNFILE_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o internfile $(INTERNFILE_OBJS) \
|
||||
$(LIBICONV) $(LIBSYS)
|
||||
trinternfile.o : internfile.cpp
|
||||
$(CXX) $(CXXFLAGS) -DTEST_INTERNFILE -c -o trinternfile.o \
|
||||
internfile.cpp
|
||||
|
||||
clean::
|
||||
rm -f *.o $(PROGS)
|
||||
|
||||
|
||||
@ -23,7 +23,7 @@
|
||||
*/
|
||||
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.3 2005-11-24 07:16:15 dockes Exp $ ";
|
||||
static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.4 2005-12-08 08:44:14 dockes Exp $ ";
|
||||
#endif
|
||||
|
||||
//#include <config.h>
|
||||
@ -273,10 +273,11 @@ HtmlParser::parse_html(const string &body)
|
||||
string::const_iterator start = body.begin();
|
||||
|
||||
while (1) {
|
||||
// Skip through until we find an HTML tag, a comment, or the end of
|
||||
// document. Ignore isolated occurrences of `<' which don't start
|
||||
// a tag or comment
|
||||
string::const_iterator p = start;
|
||||
|
||||
// Eat text until we find an HTML tag, a comment, or the end
|
||||
// of document. Ignore isolated occurrences of `<' which don't
|
||||
// start a tag or comment
|
||||
while (1) {
|
||||
p = find(p, body.end(), '<');
|
||||
if (p == body.end()) break;
|
||||
@ -286,15 +287,17 @@ HtmlParser::parse_html(const string &body)
|
||||
p++;
|
||||
}
|
||||
|
||||
|
||||
// process text up to start of tag
|
||||
if (p > start) {
|
||||
// Process text
|
||||
if (p > start || p == body.end()) {
|
||||
string text = body.substr(start - body.begin(), p - start);
|
||||
decode_entities(text);
|
||||
process_text(text);
|
||||
}
|
||||
|
||||
if (p == body.end()) break;
|
||||
if (p == body.end()) {
|
||||
do_eof();
|
||||
break;
|
||||
}
|
||||
|
||||
start = p + 1;
|
||||
|
||||
|
||||
@ -39,6 +39,7 @@ class HtmlParser {
|
||||
const map<string,string> &/*p*/) { }
|
||||
virtual void closing_tag(const string &/*tag*/) { }
|
||||
virtual void parse_html(const string &text);
|
||||
// Hook called by parse_html() once the end of the input has been reached
// (after any trailing text was processed). The base implementation does
// nothing; subclasses may override it.
virtual void do_eof() {}
|
||||
HtmlParser();
|
||||
virtual ~HtmlParser() { }
|
||||
};
|
||||
|
||||
@ -1,6 +1,9 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.12 2005-12-06 08:35:48 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.13 2005-12-08 08:44:14 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#ifndef TEST_INTERNFILE
|
||||
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
@ -178,3 +181,106 @@ FileInterner::~FileInterner()
|
||||
m_handler = 0;
|
||||
tmpcleanup();
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
#include "debuglog.h"
|
||||
#include "rclinit.h"
|
||||
#include "internfile.h"
|
||||
|
||||
// Program name as invoked (set from argv[0] in main), used in the usage message.
static string thisprog;
|
||||
|
||||
// Command-line synopsis printed by Usage().
static string usage =
|
||||
" internfile <filename> [ipath]\n"
|
||||
" \n\n"
|
||||
;
|
||||
|
||||
// Print the program name and the usage text on cerr, then terminate the
// process with exit status 1. Never returns.
static void
|
||||
Usage(void)
|
||||
{
|
||||
cerr << thisprog << ": usage:\n" << usage;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// NOTE(review): op_flags and OPT_q are declared but not referenced anywhere
// in the test driver code shown here.
static int op_flags;
|
||||
#define OPT_q 0x1
|
||||
|
||||
// Test driver entry point (built only when TEST_INTERNFILE is defined).
// Usage: internfile <filename> [ipath]
// Runs FileInterner::internfile() on the given file and prints the fields
// of the resulting Rcl::Doc on cout. Exits with status 1 on a usage error,
// a configuration problem, or an internfile failure.
int main(int argc, char **argv)
|
||||
{
|
||||
thisprog = argv[0];
|
||||
argc--; argv++;
|
||||
|
||||
// Option parsing loop. The switch below only has a default case, so any
// option character currently ends in Usage().
while (argc > 0 && **argv == '-') {
|
||||
(*argv)++;
|
||||
if (!(**argv))
|
||||
/* Case of a lone "-" argument, as in "adb - core" */
|
||||
Usage();
|
||||
while (**argv)
|
||||
switch (*(*argv)++) {
|
||||
default: Usage(); break;
|
||||
}
|
||||
argc--; argv++;
|
||||
}
|
||||
// Set the debug log level to DEBDEB1 and name "stderr" as the log file
// (presumably this routes log output to standard error -- confirm in DebugLog).
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
||||
DebugLog::setfilename("stderr");
|
||||
|
||||
// First positional argument: file name (mandatory).
if (argc < 1)
|
||||
Usage();
|
||||
string fn(*argv++);
|
||||
argc--;
|
||||
// Second positional argument: ipath (optional, empty when absent).
string ipath;
|
||||
if (argc >= 1) {
|
||||
ipath.append(*argv++);
|
||||
argc--;
|
||||
}
|
||||
// Initialize the configuration; on failure `reason' is printed as the
// explanation.
string reason;
|
||||
RclConfig *config = recollinit(0, 0, reason);
|
||||
|
||||
if (config == 0 || !config->ok()) {
|
||||
string str = "Configuration problem: ";
|
||||
str += reason;
|
||||
fprintf(stderr, "%s\n", str.c_str());
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// NOTE(review): the third constructor argument is presumably the directory
// for temporary files -- confirm against FileInterner's declaration.
FileInterner interner(fn, config, "/tmp");
|
||||
Rcl::Doc doc;
|
||||
FileInterner::Status status = interner.internfile(doc, ipath);
|
||||
// FIDone and FIAgain are both accepted; any other status is fatal.
switch (status) {
|
||||
case FileInterner::FIDone:
|
||||
case FileInterner::FIAgain:
|
||||
break;
|
||||
case FileInterner::FIError:
|
||||
default:
|
||||
fprintf(stderr, "internfile failed\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Dump every document field, each wrapped in [[[[ ]]]] and separated by a
// dashed line.
cout << "doc.url [[[[" << doc.url <<
|
||||
"]]]]\n-----------------------------------------------------\n" <<
|
||||
"doc.ipath [[[[" << doc.ipath <<
|
||||
"]]]]\n-----------------------------------------------------\n" <<
|
||||
"doc.mimetype [[[[" << doc.mimetype <<
|
||||
"]]]]\n-----------------------------------------------------\n" <<
|
||||
"doc.fmtime [[[[" << doc.fmtime <<
|
||||
"]]]]\n-----------------------------------------------------\n" <<
|
||||
"doc.dmtime [[[[" << doc.dmtime <<
|
||||
"]]]]\n-----------------------------------------------------\n" <<
|
||||
"doc.origcharset [[[[" << doc.origcharset <<
|
||||
"]]]]\n-----------------------------------------------------\n" <<
|
||||
"doc.title [[[[" << doc.title <<
|
||||
"]]]]\n-----------------------------------------------------\n" <<
|
||||
"doc.keywords [[[[" << doc.keywords <<
|
||||
"]]]]\n-----------------------------------------------------\n" <<
|
||||
"doc.abstract [[[[" << doc.abstract <<
|
||||
"]]]]\n-----------------------------------------------------\n" <<
|
||||
"doc.text [[[[" << doc.text << "]]]]\n";
|
||||
}
|
||||
|
||||
#endif // TEST_INTERNFILE
|
||||
|
||||
@ -80,10 +80,10 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
|
||||
// instead of the configuration one.
|
||||
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
|
||||
|
||||
MyHtmlParser pres;
|
||||
MyHtmlParser result;
|
||||
for (int pass = 0; pass < 2; pass++) {
|
||||
string transcoded;
|
||||
|
||||
LOGDEB(("Html::mkDoc: pass %d\n", pass));
|
||||
MyHtmlParser p;
|
||||
// Try transcoding. If it fails, use original text.
|
||||
if (!transcode(htext, transcoded, charset, "UTF-8")) {
|
||||
@ -101,17 +101,21 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
|
||||
|
||||
try {
|
||||
p.parse_html(transcoded);
|
||||
// No exception: ok?
|
||||
result = p;
|
||||
break;
|
||||
} catch (bool diag) {
|
||||
pres = p;
|
||||
result = p;
|
||||
if (diag == true)
|
||||
break;
|
||||
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
|
||||
charset.c_str(),pres.doccharset.c_str()));
|
||||
if (!pres.doccharset.empty() &&
|
||||
!samecharset(pres.doccharset, pres.ocharset)) {
|
||||
charset.c_str(),result.doccharset.c_str()));
|
||||
if (!result.doccharset.empty() &&
|
||||
!samecharset(result.doccharset, result.ocharset)) {
|
||||
LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
|
||||
"reparse\n", charset.c_str(),pres.doccharset.c_str()));
|
||||
charset = pres.doccharset;
|
||||
"reparse\n", charset.c_str(),
|
||||
result.doccharset.c_str()));
|
||||
charset = result.doccharset;
|
||||
} else {
|
||||
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
|
||||
return MimeHandler::MHError;
|
||||
@ -120,11 +124,11 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
|
||||
}
|
||||
|
||||
docout.origcharset = charset;
|
||||
docout.text = pres.dump;
|
||||
//LOGDEB(("textHtmlToDoc: dump : %s\n", pres.dump.c_str()));
|
||||
docout.title = pres.title;
|
||||
docout.keywords = pres.keywords;
|
||||
docout.abstract = pres.sample;
|
||||
docout.dmtime = pres.dmtime;
|
||||
docout.text = result.dump;
|
||||
//LOGDEB(("textHtmlToDoc: dump : %s\n", result.dump.c_str()));
|
||||
docout.title = result.title;
|
||||
docout.keywords = result.keywords;
|
||||
docout.abstract = result.sample;
|
||||
docout.dmtime = result.dmtime;
|
||||
return MimeHandler::MHDone;
|
||||
}
|
||||
|
||||
@ -77,6 +77,7 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
case 'b':
|
||||
if (tag == "body") {
|
||||
dump = "";
|
||||
in_body_tag = true;
|
||||
break;
|
||||
}
|
||||
if (tag == "blockquote" || tag == "br") {
|
||||
@ -234,6 +235,7 @@ MyHtmlParser::closing_tag(const string &tag)
|
||||
case 'b':
|
||||
if (tag == "body") {
|
||||
LOGDEB1(("Myhtmlparse: body close tag found\n"));
|
||||
in_body_tag = false;
|
||||
throw true;
|
||||
}
|
||||
if (tag == "blockquote" || tag == "br") pending_space = true;
|
||||
@ -302,3 +304,14 @@ MyHtmlParser::closing_tag(const string &tag)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// End-of-document hook, called by the parser when it runs out of input.
|
||||
// If a <body> tag was opened and never closed (in_body_tag still set),
// return normally so that parsing ends without an exception.
|
||||
// Otherwise the document is too unusual to trust: throw false.
|
||||
// This is not called when a closing body tag is found, because
// closing_tag() throws true at that point.
|
||||
void
|
||||
MyHtmlParser::do_eof()
|
||||
{
|
||||
if (!in_body_tag)
|
||||
throw(false);
|
||||
}
|
||||
|
||||
@ -32,6 +32,7 @@ class MyHtmlParser : public HtmlParser {
|
||||
public:
|
||||
bool in_script_tag;
|
||||
bool in_style_tag;
|
||||
bool in_body_tag;
|
||||
bool pending_space;
|
||||
string title, sample, keywords, dump, dmtime;
|
||||
string ocharset; // This is the charset our user thinks the doc was
|
||||
@ -41,9 +42,11 @@ class MyHtmlParser : public HtmlParser {
|
||||
void process_text(const string &text);
|
||||
void opening_tag(const string &tag, const map<string,string> &p);
|
||||
void closing_tag(const string &tag);
|
||||
void do_eof();
|
||||
MyHtmlParser() :
|
||||
in_script_tag(false),
|
||||
in_style_tag(false),
|
||||
in_body_tag(false),
|
||||
pending_space(false),
|
||||
indexing_allowed(true) { }
|
||||
};
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user