process text from html files without a </body> tag

This commit is contained in:
dockes 2005-12-08 08:44:14 +00:00
parent ae6ce2638a
commit 0122545ece
7 changed files with 162 additions and 24 deletions

View File

@ -1,6 +1,6 @@
# Only test progs in there
PROGS = unacpp textsplit
PROGS = internfile unacpp textsplit
all: $(BIGLIB) $(PROGS)
@ -21,6 +21,14 @@ trtextsplit.o : textsplit.cpp
$(CXX) $(CXXFLAGS) -DTEST_TEXTSPLIT -c -o trtextsplit.o \
textsplit.cpp
# Standalone test driver for the file interner. internfile.cpp is compiled
# a second time with -DTEST_INTERNFILE, which selects the main() test
# program in that file instead of the library code.
INTERNFILE_OBJS= trinternfile.o $(BIGLIB) $(MIMELIB)
internfile : $(INTERNFILE_OBJS)
$(CXX) $(CXXFLAGS) -o internfile $(INTERNFILE_OBJS) \
$(LIBICONV) $(LIBSYS)
# Test-driver object: same source as the library object, different define.
trinternfile.o : internfile.cpp
$(CXX) $(CXXFLAGS) -DTEST_INTERNFILE -c -o trinternfile.o \
internfile.cpp
clean::
rm -f *.o $(PROGS)

View File

@ -23,7 +23,7 @@
*/
#ifndef lint
static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.3 2005-11-24 07:16:15 dockes Exp $ ";
static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.4 2005-12-08 08:44:14 dockes Exp $ ";
#endif
//#include <config.h>
@ -273,10 +273,11 @@ HtmlParser::parse_html(const string &body)
string::const_iterator start = body.begin();
while (1) {
// Skip through until we find an HTML tag, a comment, or the end of
// document. Ignore isolated occurrences of `<' which don't start
// a tag or comment
string::const_iterator p = start;
// Eat text until we find an HTML tag, a comment, or the end
// of document. Ignore isolated occurrences of `<' which don't
// start a tag or comment
while (1) {
p = find(p, body.end(), '<');
if (p == body.end()) break;
@ -286,15 +287,17 @@ HtmlParser::parse_html(const string &body)
p++;
}
// process text up to start of tag
if (p > start) {
// Process text
if (p > start || p == body.end()) {
string text = body.substr(start - body.begin(), p - start);
decode_entities(text);
process_text(text);
}
if (p == body.end()) break;
if (p == body.end()) {
do_eof();
break;
}
start = p + 1;

View File

@ -39,6 +39,7 @@ class HtmlParser {
const map<string,string> &/*p*/) { }
virtual void closing_tag(const string &/*tag*/) { }
virtual void parse_html(const string &text);
// Hook called by parse_html() once the end of the input text has been
// reached (after any trailing text was handed to process_text()).
// The default implementation does nothing; derived parsers may override it.
virtual void do_eof() {}
HtmlParser();
virtual ~HtmlParser() { }
};

View File

@ -1,6 +1,9 @@
#ifndef lint
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.12 2005-12-06 08:35:48 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.13 2005-12-08 08:44:14 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_INTERNFILE
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
@ -178,3 +181,106 @@ FileInterner::~FileInterner()
m_handler = 0;
tmpcleanup();
}
#else
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
using namespace std;
#include "debuglog.h"
#include "rclinit.h"
#include "internfile.h"
static string thisprog;
static string usage =
" internfile <filename> [ipath]\n"
" \n\n"
;
// Print the program name followed by the usage text on stderr, then
// terminate the process with exit status 1. Never returns.
static void
Usage(void)
{
    cerr << thisprog;
    cerr << ": usage:\n";
    cerr << usage;
    exit(1);
}
static int op_flags;
#define OPT_q 0x1
// Test driver (compiled only when TEST_INTERNFILE is defined).
//
// Command line: internfile <filename> [ipath]
//
// Runs FileInterner::internfile() on the given file (and optional ipath)
// and prints every field of the resulting Rcl::Doc on stdout, each value
// enclosed in [[[[ ]]]] markers.
//
// Returns 0 when the interner reports FIDone or FIAgain, 1 on a
// configuration or conversion error. Usage errors terminate through
// Usage(), which exits with status 1.
int main(int argc, char **argv)
{
    thisprog = argv[0];
    argc--; argv++;

    // No options are defined at the moment: every argument starting with
    // '-' ends up in Usage(), which does not return. The loop shape is
    // kept so that option cases can be added to the switch later.
    while (argc > 0 && **argv == '-') {
        (*argv)++;
        if (!(**argv))
            /* Lone "-" argument (the "adb - core" case) */
            Usage();
        while (**argv)
            switch (*(*argv)++) {
            default: Usage(); break;
            }
        argc--; argv++;
    }

    // Verbose debug output, sent to stderr.
    DebugLog::getdbl()->setloglevel(DEBDEB1);
    DebugLog::setfilename("stderr");

    // Mandatory file name, optional ipath.
    if (argc < 1)
        Usage();
    string fn(*argv++);
    argc--;
    string ipath;
    if (argc >= 1) {
        ipath.append(*argv++);
        argc--;
    }

    string reason;
    RclConfig *config = recollinit(0, 0, reason);
    if (config == 0 || !config->ok()) {
        string str = "Configuration problem: ";
        str += reason;
        fprintf(stderr, "%s\n", str.c_str());
        return 1;
    }

    FileInterner interner(fn, config, "/tmp");
    Rcl::Doc doc;
    FileInterner::Status status = interner.internfile(doc, ipath);
    switch (status) {
    case FileInterner::FIDone:
    case FileInterner::FIAgain:
        break;
    case FileInterner::FIError:
    default:
        fprintf(stderr, "internfile failed\n");
        // Return instead of calling exit(): exit() does not run the
        // destructors of automatic objects, so ~FileInterner (which calls
        // tmpcleanup()) would be skipped on this error path.
        return 1;
    }

    cout << "doc.url [[[[" << doc.url <<
        "]]]]\n-----------------------------------------------------\n" <<
        "doc.ipath [[[[" << doc.ipath <<
        "]]]]\n-----------------------------------------------------\n" <<
        "doc.mimetype [[[[" << doc.mimetype <<
        "]]]]\n-----------------------------------------------------\n" <<
        "doc.fmtime [[[[" << doc.fmtime <<
        "]]]]\n-----------------------------------------------------\n" <<
        "doc.dmtime [[[[" << doc.dmtime <<
        "]]]]\n-----------------------------------------------------\n" <<
        "doc.origcharset [[[[" << doc.origcharset <<
        "]]]]\n-----------------------------------------------------\n" <<
        "doc.title [[[[" << doc.title <<
        "]]]]\n-----------------------------------------------------\n" <<
        "doc.keywords [[[[" << doc.keywords <<
        "]]]]\n-----------------------------------------------------\n" <<
        "doc.abstract [[[[" << doc.abstract <<
        "]]]]\n-----------------------------------------------------\n" <<
        "doc.text [[[[" << doc.text << "]]]]\n";
    return 0;
}
#endif // TEST_INTERNFILE

View File

@ -80,10 +80,10 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
// instead of the configuration one.
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
MyHtmlParser pres;
MyHtmlParser result;
for (int pass = 0; pass < 2; pass++) {
string transcoded;
LOGDEB(("Html::mkDoc: pass %d\n", pass));
MyHtmlParser p;
// Try transcoding. If it fails, use original text.
if (!transcode(htext, transcoded, charset, "UTF-8")) {
@ -101,17 +101,21 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
try {
p.parse_html(transcoded);
// No exception: ok?
result = p;
break;
} catch (bool diag) {
pres = p;
result = p;
if (diag == true)
break;
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
charset.c_str(),pres.doccharset.c_str()));
if (!pres.doccharset.empty() &&
!samecharset(pres.doccharset, pres.ocharset)) {
charset.c_str(),result.doccharset.c_str()));
if (!result.doccharset.empty() &&
!samecharset(result.doccharset, result.ocharset)) {
LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
"reparse\n", charset.c_str(),pres.doccharset.c_str()));
charset = pres.doccharset;
"reparse\n", charset.c_str(),
result.doccharset.c_str()));
charset = result.doccharset;
} else {
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
return MimeHandler::MHError;
@ -120,11 +124,11 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
}
docout.origcharset = charset;
docout.text = pres.dump;
//LOGDEB(("textHtmlToDoc: dump : %s\n", pres.dump.c_str()));
docout.title = pres.title;
docout.keywords = pres.keywords;
docout.abstract = pres.sample;
docout.dmtime = pres.dmtime;
docout.text = result.dump;
//LOGDEB(("textHtmlToDoc: dump : %s\n", result.dump.c_str()));
docout.title = result.title;
docout.keywords = result.keywords;
docout.abstract = result.sample;
docout.dmtime = result.dmtime;
return MimeHandler::MHDone;
}

View File

@ -77,6 +77,7 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
case 'b':
if (tag == "body") {
dump = "";
in_body_tag = true;
break;
}
if (tag == "blockquote" || tag == "br") {
@ -234,6 +235,7 @@ MyHtmlParser::closing_tag(const string &tag)
case 'b':
if (tag == "body") {
LOGDEB1(("Myhtmlparse: body close tag found\n"));
in_body_tag = false;
throw true;
}
if (tag == "blockquote" || tag == "br") pending_space = true;
@ -302,3 +304,14 @@ MyHtmlParser::closing_tag(const string &tag)
break;
}
}
// End-of-input hook, called by the parser when the text runs out.
// An open <body> at that point is accepted: we return normally so the
// text collected so far gets used. With no <body> open the document is
// considered too odd to handle and an error is signalled by throwing
// false. This hook is not reached when a closing body tag was seen,
// because closing_tag() throws before the parser gets to eof.
void
MyHtmlParser::do_eof()
{
    if (in_body_tag)
        return;
    throw false;
}

View File

@ -32,6 +32,7 @@ class MyHtmlParser : public HtmlParser {
public:
bool in_script_tag;
bool in_style_tag;
bool in_body_tag;
bool pending_space;
string title, sample, keywords, dump, dmtime;
string ocharset; // This is the charset our user thinks the doc was
@ -41,9 +42,11 @@ class MyHtmlParser : public HtmlParser {
void process_text(const string &text);
void opening_tag(const string &tag, const map<string,string> &p);
void closing_tag(const string &tag);
void do_eof();
// Initial parser state: not inside any <script>, <style> or <body>
// element, no pending space, and indexing allowed.
MyHtmlParser() :
in_script_tag(false),
in_style_tag(false),
in_body_tag(false),
pending_space(false),
indexing_allowed(true) { }
};