whitespace and indents only

This commit is contained in:
Jean-Francois Dockes 2020-04-14 09:25:13 +02:00
parent d43bb992f7
commit 5dd8774b3c
3 changed files with 963 additions and 966 deletions

View File

@ -41,7 +41,7 @@ namespace Rcl {
* We use Xapian synonyms subsets to store the expansions. * We use Xapian synonyms subsets to store the expansions.
*/ */
bool createExpansionDbs(Xapian::WritableDatabase& wdb, bool createExpansionDbs(Xapian::WritableDatabase& wdb,
const vector<string>& langs) const vector<string>& langs)
{ {
LOGDEB("StemDb::createExpansionDbs: languages: " <<stringsToString(langs) << "\n"); LOGDEB("StemDb::createExpansionDbs: languages: " <<stringsToString(langs) << "\n");
Chrono cron; Chrono cron;
@ -51,99 +51,99 @@ bool createExpansionDbs(Xapian::WritableDatabase& wdb,
// If langs is empty and we don't need casediac expansion, then no need to // If langs is empty and we don't need casediac expansion, then no need to
// walk the big list // walk the big list
if (langs.empty()) { if (langs.empty()) {
if (o_index_stripchars) if (o_index_stripchars)
return true; return true;
} }
// Walk the list of all terms, and stem/unac each. // Walk the list of all terms, and stem/unac each.
string ermsg; string ermsg;
try { try {
// Stem dbs // Stem dbs
vector<XapWritableComputableSynFamMember> stemdbs; vector<XapWritableComputableSynFamMember> stemdbs;
// Note: tried to make this to work with stack-allocated objects, couldn't. // Note: tried to make this to work with stack-allocated objects, couldn't.
// Looks like a bug in copy constructors somewhere, can't guess where // Looks like a bug in copy constructors somewhere, can't guess where
vector<std::shared_ptr<SynTermTransStem> > stemmers; vector<std::shared_ptr<SynTermTransStem> > stemmers;
for (unsigned int i = 0; i < langs.size(); i++) { for (unsigned int i = 0; i < langs.size(); i++) {
stemmers.push_back(std::shared_ptr<SynTermTransStem> stemmers.push_back(std::shared_ptr<SynTermTransStem>
(new SynTermTransStem(langs[i]))); (new SynTermTransStem(langs[i])));
stemdbs.push_back( stemdbs.push_back(
XapWritableComputableSynFamMember(wdb, synFamStem, langs[i], XapWritableComputableSynFamMember(wdb, synFamStem, langs[i],
stemmers.back().get())); stemmers.back().get()));
stemdbs.back().recreate(); stemdbs.back().recreate();
} }
// Unaccented stem dbs // Unaccented stem dbs
vector<XapWritableComputableSynFamMember> unacstemdbs; vector<XapWritableComputableSynFamMember> unacstemdbs;
// We can reuse the same stemmer pointers, the objects are stateless. // We can reuse the same stemmer pointers, the objects are stateless.
if (!o_index_stripchars) { if (!o_index_stripchars) {
for (unsigned int i = 0; i < langs.size(); i++) { for (unsigned int i = 0; i < langs.size(); i++) {
unacstemdbs.push_back( unacstemdbs.push_back(
XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i], XapWritableComputableSynFamMember(
stemmers.back().get())); wdb, synFamStemUnac, langs[i], stemmers.back().get()));
unacstemdbs.back().recreate(); unacstemdbs.back().recreate();
} }
} }
SynTermTransUnac transunac(UNACOP_UNACFOLD); SynTermTransUnac transunac(UNACOP_UNACFOLD);
XapWritableComputableSynFamMember XapWritableComputableSynFamMember
diacasedb(wdb, synFamDiCa, "all", &transunac); diacasedb(wdb, synFamDiCa, "all", &transunac);
if (!o_index_stripchars) if (!o_index_stripchars)
diacasedb.recreate(); diacasedb.recreate();
Xapian::TermIterator it = wdb.allterms_begin(); Xapian::TermIterator it = wdb.allterms_begin();
// We'd want to skip to the first non-prefixed term, but this is a bit // We'd want to skip to the first non-prefixed term, but this is a bit
// complicated, so we just jump over most of the prefixed term and then // complicated, so we just jump over most of the prefixed term and then
// skip the rest one by one. // skip the rest one by one.
it.skip_to(wrap_prefix("Z")); it.skip_to(wrap_prefix("Z"));
for ( ;it != wdb.allterms_end(); it++) { for ( ;it != wdb.allterms_end(); it++) {
const string term{*it}; const string term{*it};
if (has_prefix(term)) if (has_prefix(term))
continue; continue;
// Detect and skip CJK terms. // Detect and skip CJK terms.
Utf8Iter utfit(term); Utf8Iter utfit(term);
if (utfit.eof()) // Empty term?? Seems to happen. if (utfit.eof()) // Empty term?? Seems to happen.
continue; continue;
if (TextSplit::isCJK(*utfit)) { if (TextSplit::isCJK(*utfit)) {
// LOGDEB("stemskipped: Skipping CJK\n"); // LOGDEB("stemskipped: Skipping CJK\n");
continue; continue;
} }
string lower = term; string lower = term;
// If the index is raw, compute the case-folded term which // If the index is raw, compute the case-folded term which
// is the input to the stem db, and add a synonym from the // is the input to the stem db, and add a synonym from the
// stripped term to the cased and accented one, for accent // stripped term to the cased and accented one, for accent
// and case expansion at query time // and case expansion at query time
if (!o_index_stripchars) { if (!o_index_stripchars) {
unacmaybefold(term, lower, "UTF-8", UNACOP_FOLD); unacmaybefold(term, lower, "UTF-8", UNACOP_FOLD);
diacasedb.addSynonym(term); diacasedb.addSynonym(term);
} }
// Dont' apply stemming to terms which don't look like // Dont' apply stemming to terms which don't look like
// natural language words. // natural language words.
if (!Db::isSpellingCandidate(term)) { if (!Db::isSpellingCandidate(term)) {
LOGDEB1("createExpansionDbs: skipped: [" << term << "]\n"); LOGDEB1("createExpansionDbs: skipped: [" << term << "]\n");
continue; continue;
} }
// Create stemming synonym for every language. The input is the // Create stemming synonym for every language. The input is the
// lowercase accented term // lowercase accented term
for (unsigned int i = 0; i < langs.size(); i++) { for (unsigned int i = 0; i < langs.size(); i++) {
stemdbs[i].addSynonym(lower); stemdbs[i].addSynonym(lower);
} }
// For a raw index, also maybe create a stem expansion for // For a raw index, also maybe create a stem expansion for
// the unaccented term. While this may be incorrect, it is // the unaccented term. While this may be incorrect, it is
// also necessary for searching in a diacritic-unsensitive // also necessary for searching in a diacritic-unsensitive
// way on a raw index // way on a raw index
if (!o_index_stripchars) { if (!o_index_stripchars) {
string unac; string unac;
unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC); unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
if (unac != lower) { if (unac != lower) {
for (unsigned int i = 0; i < langs.size(); i++) { for (unsigned int i = 0; i < langs.size(); i++) {
unacstemdbs[i].addSynonym(unac); unacstemdbs[i].addSynonym(unac);
} }
} }
} }
} }
} XCATCHERROR(ermsg); } XCATCHERROR(ermsg);
if (!ermsg.empty()) { if (!ermsg.empty()) {

File diff suppressed because it is too large Load Diff

View File

@ -52,21 +52,18 @@ class TermProc {
public: public:
TermProc(TermProc* next) : m_next(next) {} TermProc(TermProc* next) : m_next(next) {}
virtual ~TermProc() {} virtual ~TermProc() {}
virtual bool takeword(const string &term, int pos, int bs, int be) virtual bool takeword(const string &term, int pos, int bs, int be) {
{
if (m_next) if (m_next)
return m_next->takeword(term, pos, bs, be); return m_next->takeword(term, pos, bs, be);
else else
return true; return true;
} }
// newpage() is like takeword(), but for page breaks. // newpage() is like takeword(), but for page breaks.
virtual void newpage(int pos) virtual void newpage(int pos) {
{
if (m_next) if (m_next)
m_next->newpage(pos); m_next->newpage(pos);
} }
virtual bool flush() virtual bool flush() {
{
if (m_next) if (m_next)
return m_next->flush(); return m_next->flush();
else else
@ -137,7 +134,7 @@ public:
// We don't generate a fatal error because of a bad term, // We don't generate a fatal error because of a bad term,
// but one has to put the limit somewhere // but one has to put the limit somewhere
if (m_unacerrors > 500 && if (m_unacerrors > 500 &&
(double(m_totalterms) / double(m_unacerrors)) < 2.0) { (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
// More than 1 error for every other term // More than 1 error for every other term
LOGERR("splitter::takeword: too many unac errors " << LOGERR("splitter::takeword: too many unac errors " <<
m_unacerrors << "/" << m_totalterms << "\n"); m_unacerrors << "/" << m_totalterms << "\n");
@ -147,12 +144,12 @@ public:
} }
if (otrm.empty()) { if (otrm.empty()) {
// It may happen in some weird cases that the output from // It may happen in some weird cases that the output from
// unac is empty (if the word actually consisted entirely // unac is empty (if the word actually consisted entirely
// of diacritics ...) The consequence is that a phrase // of diacritics ...) The consequence is that a phrase
// search won't work without addional slack. // search won't work without addional slack.
return true; return true;
} }
// We should have a Japanese stemmer to handle this, but for // We should have a Japanese stemmer to handle this, but for
// experimenting, let's do it here: remove 'prolounged sound // experimenting, let's do it here: remove 'prolounged sound
@ -174,34 +171,34 @@ public:
return true; return true;
} }
// It may also occur that unac introduces spaces in the string // It may also occur that unac introduces spaces in the string
// (when removing isolated accents, may happen for Greek // (when removing isolated accents, may happen for Greek
// for example). This is a pathological situation. We // for example). This is a pathological situation. We
// index all the resulting terms at the same pos because // index all the resulting terms at the same pos because
// the surrounding code is not designed to handle a pos // the surrounding code is not designed to handle a pos
// change in here. This means that phrase searches and // change in here. This means that phrase searches and
// snippets will be wrong, but at least searching for the // snippets will be wrong, but at least searching for the
// terms will work. // terms will work.
bool hasspace = false; bool hasspace = false;
for (string::const_iterator it = otrm.begin();it < otrm.end();it++) { for (string::const_iterator it = otrm.begin();it < otrm.end();it++) {
if (*it == ' ') { if (*it == ' ') {
hasspace=true; hasspace=true;
break; break;
} }
} }
if (hasspace) { if (hasspace) {
std::vector<std::string> terms; std::vector<std::string> terms;
stringToTokens(otrm, terms, " ", true); stringToTokens(otrm, terms, " ", true);
for (std::vector<std::string>::const_iterator it = terms.begin(); for (std::vector<std::string>::const_iterator it = terms.begin();
it < terms.end(); it++) { it < terms.end(); it++) {
if (!TermProc::takeword(*it, pos, bs, be)) { if (!TermProc::takeword(*it, pos, bs, be)) {
return false; return false;
} }
} }
return true; return true;
} else { } else {
return TermProc::takeword(otrm, pos, bs, be); return TermProc::takeword(otrm, pos, bs, be);
} }
} }
virtual bool flush() virtual bool flush()