Jean-Francois Dockes 2020-04-03 07:34:41 +01:00
commit 7656d1b2ef
2 changed files with 41 additions and 27 deletions

View File

@@ -103,6 +103,7 @@ static bool initCmd()
 bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
 {
+    LOGDEB1("ko_to_words\n");
     std::unique_lock<std::mutex> mylock(o_mutex);
     initCmd();
     if (nullptr == o_talker) {
@@ -131,16 +132,13 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
         c = *it;
         if (!isHANGUL(c) && isalpha(c)) {
             // Done with Korean stretch, process and go back to main routine
-            //std::cerr << "Broke on char " << (std::string)it << endl;
+            LOGDEB1("ko_to_words: broke on " << (std::string)it << endl);
             break;
         } else {
             if (c == '\f') {
-                inputdata += magicpage;
+                inputdata += magicpage + " ";
             } else {
-                if (isKomoran && (c == '\n' || c == '\r')) {
-                    // Komoran does not like some control chars (initially
-                    // thought only formfeed, but not), which is a prob
-                    // for pdf pages counts. will need to fix this
+                if (c < 0x20 || (c > 0x7e && c < 0xa0)) {
                     inputdata += ' ';
                 } else {
                     it.appendchartostring(inputdata);
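
Read as a whole, the new filtering rule is: a form feed becomes the page-marker token followed by a space, every other C0/C1 control character becomes a plain space, and everything else is passed through to the Korean tagger untouched. Below is a minimal stand-alone sketch of that rule, assuming the input is already decoded to code points (the real loop walks UTF-8 in place with Utf8Iter and appends the original bytes); the function name and signature are illustrative only.

#include <string>

// Sketch only: mimic the character filtering done above before the text is
// handed to the Korean POS tagger.
static std::u32string sanitizeForTagger(const std::u32string& in,
                                        const std::u32string& magicpage)
{
    std::u32string out;
    out.reserve(in.size());
    for (char32_t c : in) {
        if (c == U'\f') {
            // Keep page breaks as a distinct marker token; the trailing
            // space stops the tagger from gluing it to the next word.
            out += magicpage;
            out += U' ';
        } else if (c < 0x20 || (c > 0x7e && c < 0xa0)) {
            // Other C0/C1 control characters upset the tagger: flatten to space.
            out += U' ';
        } else {
            out += c;
        }
    }
    return out;
}
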
@@ -175,9 +173,10 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
     vector<string> tags;
     stringToTokens(outtags, tags, sepchars);
-    // This is the position in the whole text, not the local fragment,
-    // which is bytepos-orgbytepos
-    string::size_type bytepos(orgbytepos);
+    // This is the position in the local fragment,
+    // not in the whole text which is orgbytepos + bytepos
+    string::size_type bytepos{0};
+    string::size_type pagefix{0};
     for (unsigned int i = 0; i < words.size(); i++) {
         // The POS tagger strips characters from the input (e.g. multiple
         // spaces, sometimes new lines, possibly other stuff). This
@@ -190,25 +189,32 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
         string word = words[i];
         trimstring(word);
         if (word == magicpage) {
+            LOGDEB1("ko_to_words: NEWPAGE\n");
             newpage(m_wordpos);
+            bytepos += word.size() + 1;
+            pagefix += word.size();
+            continue;
         }
-        string::size_type newpos = bytepos - orgbytepos;
-        newpos = inputdata.find(word, newpos);
+        // Find the actual start position of the word in the section.
+        string::size_type newpos = inputdata.find(word, bytepos);
         if (newpos != string::npos) {
-            bytepos = orgbytepos + newpos;
+            bytepos = newpos;
+        } else {
+            LOGDEB("textsplitko: word [" << word << "] not found in text\n");
         }
-        LOGDEB1("WORD OPOS " << bytepos-orgbytepos <<
-                " FOUND POS " << newpos << endl);
+        LOGDEB1("WORD [" << word << "] size " << word.size() <<
+                " TAG " << tags[i] << " inputdata size " << inputdata.size() <<
+                " absbytepos " << orgbytepos + bytepos <<
+                " bytepos " << bytepos << " word from text: " <<
+                inputdata.substr(bytepos, word.size()) << endl);
         if (tags[i] == "Noun" || tags[i] == "Verb" ||
             tags[i] == "Adjective" || tags[i] == "Adverb") {
-            if (!takeword(
-                    word, m_wordpos++, bytepos, bytepos + words[i].size())) {
+            string::size_type abspos = orgbytepos + bytepos - pagefix;
+            if (!takeword(word, m_wordpos++, abspos, abspos + word.size())) {
                 return false;
             }
         }
-        LOGDEB1("WORD [" << words[i] << "] size " << words[i].size() <<
-                " TAG " << tags[i] << endl);
-        bytepos += words[i].size();
+        bytepos += word.size();
     }
 #if DO_CHECK_THINGS
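
The net effect of the new bookkeeping: bytepos is now an offset inside the local fragment (inputdata), each tagger output word is re-located with find() because the tagger may add or drop whitespace, and pagefix subtracts the bytes the injected page markers added, so takeword() still receives offsets that map back to the original document. A small self-contained illustration with made-up data follows; the marker value and the main() scaffolding exist only for the example.

#include <iostream>
#include <string>
#include <vector>

int main()
{
    const std::string magicpage = "_MAGICPAGE_";       // hypothetical marker value
    // Original fragment "hello\fworld" after the filtering step above:
    std::string inputdata = "hello" + magicpage + " world";
    std::vector<std::string> words{"hello", magicpage, "world"};
    std::string::size_type orgbytepos = 100;           // fragment start in the document
    std::string::size_type bytepos = 0, pagefix = 0;

    for (const auto& word : words) {
        if (word == magicpage) {
            bytepos += word.size() + 1;                 // skip marker and its space
            pagefix += word.size();                     // these bytes are not in the source
            continue;
        }
        auto newpos = inputdata.find(word, bytepos);
        if (newpos != std::string::npos)
            bytepos = newpos;
        auto abspos = orgbytepos + bytepos - pagefix;
        std::cout << word << " -> absolute byte " << abspos << "\n";
        bytepos += word.size();
    }
    // Prints: hello -> absolute byte 100, world -> absolute byte 106,
    // i.e. the word positions in the original "hello\fworld".
}
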
@@ -229,5 +235,6 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
     clearsplitstate();
     m_spanpos = m_wordpos = pos;
     *cp = c;
+    LOGDEB1("ko_to_words: returning\n");
     return true;
 }

View File

@@ -106,13 +106,14 @@ struct MatchFragment {
 class TextSplitABS : public TextSplit {
 public:
-    TextSplitABS(const vector<string>& matchTerms,
+    TextSplitABS(const string& rawtext, const vector<string>& matchTerms,
                  const HighlightData& hdata,
                  unordered_map<string, double>& wordcoefs,
                  unsigned int ctxwords,
                  Flags flags,
                  unsigned int maxterms)
-        : TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
+        : TextSplit(flags), m_rawtext(rawtext),
+          m_terms(matchTerms.begin(), matchTerms.end()),
          m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords),
          maxtermcount(maxterms) {
@@ -132,7 +133,7 @@ public:
     // Accept a word and its position. If the word is a matched term,
     // add/update fragment definition.
     virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
-        LOGDEB2("takeword: " << term << endl);
+        LOGDEB1("takeword: [" << term << "] bytepos: "<<bts<<":"<<bte<<endl);
         // Limit time taken with monster documents. The resulting
         // abstract will be incorrect or inexistant, but this is
         // better than taking forever (the default cutoff value comes
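
The reason for passing rawtext down to the splitter shows in the new log statements: the bts/bte arguments of takeword() are byte offsets into the very string handed to text_to_words(), so slicing the raw text with them should reproduce the term (and, later, the whole fragment). Here is a throwaway checker in that spirit; it assumes nothing about the real TextSplit interface beyond the takeword() signature shown above, and the class name is made up.

#include <iostream>
#include <string>

// Illustrative only: a takeword()-style consumer that, like the debug
// statements added in this commit, keeps a reference to the raw text and
// uses the byte offsets it receives to slice out and display the term.
class OffsetLogger {
public:
    explicit OffsetLogger(const std::string& rawtext) : m_rawtext(rawtext) {}

    bool takeword(const std::string& term, int pos, int bts, int bte) {
        std::cout << "pos " << pos << " term [" << term << "] raw text slice ["
                  << m_rawtext.substr(bts, bte - bts) << "]\n";
        return true;
    }

private:
    const std::string& m_rawtext;   // same idea as the m_rawtext member added below
};

int main()
{
    std::string rawtext = "quick brown fox";
    OffsetLogger logger(rawtext);
    logger.takeword("brown", 1, 6, 11);   // bytes 6..11 hold "brown"
}
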
@@ -169,9 +170,9 @@ public:
         if (m_terms.find(dumb) != m_terms.end()) {
             // This word is a search term. Extend or create fragment
-            LOGDEB2("match: [" << dumb << "] current: " << m_curfrag.first <<
-                    ", " << m_curfrag.second << " remain " <<
-                    m_remainingWords << endl);
+            LOGDEB1("match: [" << dumb << "] pos " << pos << " bpos " << bts <<
+                    ":" << bte << " remainingWords " << m_remainingWords << endl);
+            LOGDEB1("Match text " << m_rawtext.substr(bts, bte - bts) << endl);
             double coef = m_wordcoefs[dumb];
             if (!m_remainingWords) {
                 // No current fragment. Start one
@@ -219,7 +220,7 @@ public:
             // Term group (phrase/near) handling
             m_plists[dumb].push_back(pos);
             m_gpostobytes[pos] = pair<int,int>(bts, bte);
-            LOGDEB2("Recorded bpos for " << pos << ": " << bts << " " <<
+            LOGDEB1("Recorded bpos for pos " << pos << ": " << bts << " " <<
                     bte << "\n");
         }
     }
@@ -236,6 +237,11 @@ public:
             m_remainingWords--;
             m_curfrag.second = bte;
             if (m_remainingWords == 0) {
+                LOGDEB1("FRAGMENT: from byte " << m_curfrag.first <<
+                        " to byte " << m_curfrag.second << endl);
+                LOGDEB1("FRAGMENT TEXT [" << m_rawtext.substr(
+                            m_curfrag.first, m_curfrag.second-m_curfrag.first)
+                        << "]\n");
                 // We used to not push weak fragments if we had a lot
                 // already. This can cause problems if the fragments
                 // we drop are actually group fragments (which have
@@ -337,6 +343,7 @@ public:
     }
 private:
+    const string& m_rawtext;
     // Past terms because we need to go back for context before a hit
     deque<pair<int,int>> m_prevterms;
     // Data about the fragment we are building
@@ -424,7 +431,7 @@ int Query::Native::abstractFromText(
     }
     LOGABS("abstractFromText: getterms: " << chron.millis() << "mS\n");
-    TextSplitABS splitter(matchTerms, hld, wordcoefs, ctxwords,
+    TextSplitABS splitter(rawtext, matchTerms, hld, wordcoefs, ctxwords,
                           TextSplit::TXTS_ONLYSPANS,
                           m_q->m_snipMaxPosWalk);
     splitter.text_to_words(rawtext);