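Add comments on basepos/curpos and move the inter-section position jump (basepos += curpos + 100) from Db::addOrUpdate() into TextSplitDb::text_to_words()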

This commit is contained in:
Jean-Francois Dockes 2011-10-04 16:33:44 +02:00
parent e4eba0de97
commit acb297c9df

View File

@ -855,14 +855,24 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
} }
// The splitter breaks text into words and adds postings to the Xapian document. // The splitter breaks text into words and adds postings to the Xapian
// document. We use a single object to split all of the document
// fields and position jumps to separate fields
class TextSplitDb : public TextSplit { class TextSplitDb : public TextSplit {
public: public:
Xapian::WritableDatabase db; Xapian::WritableDatabase db;
Xapian::Document &doc; // Xapian document Xapian::Document &doc; // Xapian document
Xapian::termpos basepos; // Base for document section // Base for document section. Gets large increment when we change
Xapian::termpos curpos; // Current position. Used to set basepos for the // sections, to avoid cross-section proximity matches.
// following section Xapian::termpos basepos;
// Current relative position. This is the remembered value from
// the splitter callback. The term position is reset for each call
// to text_to_words(), so that the last value of curpos is the
// section size (last relative term position), and this is what
// gets added to basepos in addition to the inter-section increment
// to compute the first position of the next section.
Xapian::termpos curpos;
StopList &stops; StopList &stops;
TextSplitDb(Xapian::WritableDatabase idb, TextSplitDb(Xapian::WritableDatabase idb,
Xapian::Document &d, StopList &_stops) Xapian::Document &d, StopList &_stops)
@ -894,11 +904,13 @@ bool TextSplitDb::text_to_words(const string &in)
} XCATCHERROR(ermsg); } XCATCHERROR(ermsg);
if (!ermsg.empty()) { if (!ermsg.empty()) {
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
basepos += curpos + 100;
return false; return false;
} }
if (!TextSplit::text_to_words(in)) { if (!TextSplit::text_to_words(in)) {
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n")); LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
basepos += curpos + 100;
return false; return false;
} }
@ -909,8 +921,10 @@ bool TextSplitDb::text_to_words(const string &in)
} XCATCHERROR(ermsg); } XCATCHERROR(ermsg);
if (!ermsg.empty()) { if (!ermsg.empty()) {
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
basepos += curpos + 100;
return false; return false;
} }
basepos += curpos + 100;
return true; return true;
} }
@ -1024,7 +1038,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str())); LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
if (!splitter.text_to_words(doc.utf8fn)) if (!splitter.text_to_words(doc.utf8fn))
LOGDEB(("Db::addOrUpdate: split failed for file name\n")); LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
splitter.basepos += splitter.curpos + 100;
// If the ipath is like a path, index the last element. This is // If the ipath is like a path, index the last element. This is
// for compound documents like zip and chm for which the filter // for compound documents like zip and chm for which the filter
@ -1038,7 +1051,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
if (transcode(path_getsimple(doc.ipath), utf8ipathlast, if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
"UTF-8", "UTF-8")) { "UTF-8", "UTF-8")) {
splitter.text_to_words(utf8ipathlast); splitter.text_to_words(utf8ipathlast);
splitter.basepos += splitter.curpos + 100;
} }
} }
@ -1060,8 +1072,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
splitter.basepos + splitter.curpos++); splitter.basepos + splitter.curpos++);
} }
} }
splitter.basepos += splitter.curpos + 100;
// Index textual metadata. These are all indexed as text with // Index textual metadata. These are all indexed as text with
// positions, as we may want to do phrase searches with them (this // positions, as we may want to do phrase searches with them (this
@ -1088,7 +1098,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
if (!splitter.text_to_words(meta_it->second)) if (!splitter.text_to_words(meta_it->second))
LOGDEB(("Db::addOrUpdate: split failed for %s\n", LOGDEB(("Db::addOrUpdate: split failed for %s\n",
meta_it->first.c_str())); meta_it->first.c_str()));
splitter.basepos += splitter.curpos + 100;
} }
} }
splitter.setprefix(string()); splitter.setprefix(string());
@ -1096,8 +1105,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
if (splitter.curpos < baseTextPosition) if (splitter.curpos < baseTextPosition)
splitter.basepos = baseTextPosition; splitter.basepos = baseTextPosition;
else
splitter.basepos += splitter.curpos + 100;
// Split and index body text // Split and index body text
LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str())); LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));