diff --git a/src/common/Makefile b/src/common/Makefile index 53e1e6d1..032c6cda 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -1,6 +1,6 @@ # Only test progs in there -PROGS = unacpp textsplit +PROGS = internfile unacpp textsplit all: $(BIGLIB) $(PROGS) @@ -21,6 +21,14 @@ trtextsplit.o : textsplit.cpp $(CXX) $(CXXFLAGS) -DTEST_TEXTSPLIT -c -o trtextsplit.o \ textsplit.cpp +INTERNFILE_OBJS= trinternfile.o $(BIGLIB) $(MIMELIB) +internfile : $(INTERNFILE_OBJS) + $(CXX) $(CXXFLAGS) -o internfile $(INTERNFILE_OBJS) \ + $(LIBICONV) $(LIBSYS) +trinternfile.o : internfile.cpp + $(CXX) $(CXXFLAGS) -DTEST_INTERNFILE -c -o trinternfile.o \ + internfile.cpp + clean:: rm -f *.o $(PROGS) diff --git a/src/internfile/htmlparse.cpp b/src/internfile/htmlparse.cpp index 37ea066d..de1756a9 100644 --- a/src/internfile/htmlparse.cpp +++ b/src/internfile/htmlparse.cpp @@ -23,7 +23,7 @@ */ #ifndef lint -static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.3 2005-11-24 07:16:15 dockes Exp $ "; +static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.4 2005-12-08 08:44:14 dockes Exp $ "; #endif //#include @@ -273,10 +273,11 @@ HtmlParser::parse_html(const string &body) string::const_iterator start = body.begin(); while (1) { - // Skip through until we find an HTML tag, a comment, or the end of - // document. Ignore isolated occurences of `<' which don't start - // a tag or comment string::const_iterator p = start; + + // Eat text until we find an HTML tag, a comment, or the end + // of document. 
Ignore isolated occurrences of `<' which don't + // start a tag or comment while (1) { p = find(p, body.end(), '<'); if (p == body.end()) break; @@ -286,15 +287,17 @@ HtmlParser::parse_html(const string &body) p++; } - - // process text up to start of tag - if (p > start) { + // Process text + if (p > start || p == body.end()) { string text = body.substr(start - body.begin(), p - start); decode_entities(text); process_text(text); } - if (p == body.end()) break; + if (p == body.end()) { + do_eof(); + break; + } start = p + 1; diff --git a/src/internfile/htmlparse.h b/src/internfile/htmlparse.h index 20da34f0..d8db6e00 100644 --- a/src/internfile/htmlparse.h +++ b/src/internfile/htmlparse.h @@ -39,6 +39,7 @@ class HtmlParser { const map &/*p*/) { } virtual void closing_tag(const string &/*tag*/) { } virtual void parse_html(const string &text); + virtual void do_eof() {} HtmlParser(); virtual ~HtmlParser() { } }; diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index aa938b38..30844287 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -1,6 +1,9 @@ #ifndef lint -static char rcsid[] = "@(#$Id: internfile.cpp,v 1.12 2005-12-06 08:35:48 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: internfile.cpp,v 1.13 2005-12-08 08:44:14 dockes Exp $ (C) 2004 J.F.Dockes"; #endif + +#ifndef TEST_INTERNFILE + #include #include #include @@ -178,3 +181,106 @@ FileInterner::~FileInterner() m_handler = 0; tmpcleanup(); } + +#else + +#include +#include +#include +#include +using namespace std; + +#include "debuglog.h" +#include "rclinit.h" +#include "internfile.h" + +static string thisprog; + +static string usage = + " internfile [ipath]\n" + " \n\n" + ; + +static void +Usage(void) +{ + cerr << thisprog << ": usage:\n" << usage; + exit(1); +} + +static int op_flags; +#define OPT_q 0x1 + +int main(int argc, char **argv) +{ + thisprog = argv[0]; + argc--; argv++; + + while (argc > 0 && **argv == '-') { + (*argv)++; + 
if (!(**argv)) + /* Lone "-" argument, as in "adb - core" */ + Usage(); + while (**argv) + switch (*(*argv)++) { + default: Usage(); break; + } + argc--; argv++; + } + DebugLog::getdbl()->setloglevel(DEBDEB1); + DebugLog::setfilename("stderr"); + + if (argc < 1) + Usage(); + string fn(*argv++); + argc--; + string ipath; + if (argc >= 1) { + ipath.append(*argv++); + argc--; + } + string reason; + RclConfig *config = recollinit(0, 0, reason); + + if (config == 0 || !config->ok()) { + string str = "Configuration problem: "; + str += reason; + fprintf(stderr, "%s\n", str.c_str()); + exit(1); + } + + FileInterner interner(fn, config, "/tmp"); + Rcl::Doc doc; + FileInterner::Status status = interner.internfile(doc, ipath); + switch (status) { + case FileInterner::FIDone: + case FileInterner::FIAgain: + break; + case FileInterner::FIError: + default: + fprintf(stderr, "internfile failed\n"); + exit(1); + } + + cout << "doc.url [[[[" << doc.url << + "]]]]\n-----------------------------------------------------\n" << + "doc.ipath [[[[" << doc.ipath << + "]]]]\n-----------------------------------------------------\n" << + "doc.mimetype [[[[" << doc.mimetype << + "]]]]\n-----------------------------------------------------\n" << + "doc.fmtime [[[[" << doc.fmtime << + "]]]]\n-----------------------------------------------------\n" << + "doc.dmtime [[[[" << doc.dmtime << + "]]]]\n-----------------------------------------------------\n" << + "doc.origcharset [[[[" << doc.origcharset << + "]]]]\n-----------------------------------------------------\n" << + "doc.title [[[[" << doc.title << + "]]]]\n-----------------------------------------------------\n" << + "doc.keywords [[[[" << doc.keywords << + "]]]]\n-----------------------------------------------------\n" << + "doc.abstract [[[[" << doc.abstract << + "]]]]\n-----------------------------------------------------\n" << + "doc.text [[[[" << doc.text << "]]]]\n"; +} + +#endif // TEST_INTERNFILE diff --git a/src/internfile/mh_html.cpp 
b/src/internfile/mh_html.cpp index 56324a46..d6fa8403 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -80,10 +80,10 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &, // instead of the configuration one. LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str())); - MyHtmlParser pres; + MyHtmlParser result; for (int pass = 0; pass < 2; pass++) { string transcoded; - + LOGDEB(("Html::mkDoc: pass %d\n", pass)); MyHtmlParser p; // Try transcoding. If it fails, use original text. if (!transcode(htext, transcoded, charset, "UTF-8")) { @@ -101,17 +101,21 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &, try { p.parse_html(transcoded); + // No exception: ok? + result = p; + break; } catch (bool diag) { - pres = p; + result = p; if (diag == true) break; LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n", - charset.c_str(),pres.doccharset.c_str())); - if (!pres.doccharset.empty() && - !samecharset(pres.doccharset, pres.ocharset)) { + charset.c_str(),result.doccharset.c_str())); + if (!result.doccharset.empty() && + !samecharset(result.doccharset, result.ocharset)) { LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s'," - "reparse\n", charset.c_str(),pres.doccharset.c_str())); - charset = pres.doccharset; + "reparse\n", charset.c_str(), + result.doccharset.c_str())); + charset = result.doccharset; } else { LOGERR(("textHtmlToDoc:: error: non charset exception\n")); return MimeHandler::MHError; @@ -120,11 +124,11 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &, } docout.origcharset = charset; - docout.text = pres.dump; - //LOGDEB(("textHtmlToDoc: dump : %s\n", pres.dump.c_str())); - docout.title = pres.title; - docout.keywords = pres.keywords; - docout.abstract = pres.sample; - docout.dmtime = pres.dmtime; + docout.text = result.dump; + //LOGDEB(("textHtmlToDoc: dump : %s\n", result.dump.c_str())); + docout.title = result.title; + docout.keywords = result.keywords; + docout.abstract = result.sample; 
+ docout.dmtime = result.dmtime; return MimeHandler::MHDone; } diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp index aca5cf35..2e1b17e1 100644 --- a/src/internfile/myhtmlparse.cpp +++ b/src/internfile/myhtmlparse.cpp @@ -77,6 +77,7 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) case 'b': if (tag == "body") { dump = ""; + in_body_tag = true; break; } if (tag == "blockquote" || tag == "br") { @@ -234,6 +235,7 @@ MyHtmlParser::closing_tag(const string &tag) case 'b': if (tag == "body") { LOGDEB1(("Myhtmlparse: body close tag found\n")); + in_body_tag = false; throw true; } if (tag == "blockquote" || tag == "br") pending_space = true; @@ -302,3 +304,14 @@ MyHtmlParser::closing_tag(const string &tag) break; } } + +// This gets called when hitting eof. If the body tag is open, do +// something with the text (that is, don't throw up). Else, things are +// too weird, throw an error. We don't get called if the parser finds +// a closing body tag (exception gets thrown by closing_tag()) +void +MyHtmlParser::do_eof() +{ + if (!in_body_tag) + throw(false); +} diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h index 441217fc..5f4a0c89 100644 --- a/src/internfile/myhtmlparse.h +++ b/src/internfile/myhtmlparse.h @@ -32,6 +32,7 @@ class MyHtmlParser : public HtmlParser { public: bool in_script_tag; bool in_style_tag; + bool in_body_tag; bool pending_space; string title, sample, keywords, dump, dmtime; string ocharset; // This is the charset our user thinks the doc was @@ -41,9 +42,11 @@ class MyHtmlParser : public HtmlParser { void process_text(const string &text); void opening_tag(const string &tag, const map &p); void closing_tag(const string &tag); + void do_eof(); MyHtmlParser() : in_script_tag(false), in_style_tag(false), + in_body_tag(false), pending_space(false), indexing_allowed(true) { } };