Merge branch 'master' of https://framagit.org/medoc90/recoll
This commit is contained in:
commit
7656d1b2ef
@ -103,6 +103,7 @@ static bool initCmd()
|
|||||||
|
|
||||||
bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||||
{
|
{
|
||||||
|
LOGDEB1("ko_to_words\n");
|
||||||
std::unique_lock<std::mutex> mylock(o_mutex);
|
std::unique_lock<std::mutex> mylock(o_mutex);
|
||||||
initCmd();
|
initCmd();
|
||||||
if (nullptr == o_talker) {
|
if (nullptr == o_talker) {
|
||||||
@ -131,16 +132,13 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
c = *it;
|
c = *it;
|
||||||
if (!isHANGUL(c) && isalpha(c)) {
|
if (!isHANGUL(c) && isalpha(c)) {
|
||||||
// Done with Korean stretch, process and go back to main routine
|
// Done with Korean stretch, process and go back to main routine
|
||||||
//std::cerr << "Broke on char " << (std::string)it << endl;
|
LOGDEB1("ko_to_words: broke on " << (std::string)it << endl);
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
if (c == '\f') {
|
if (c == '\f') {
|
||||||
inputdata += magicpage;
|
inputdata += magicpage + " ";
|
||||||
} else {
|
} else {
|
||||||
if (isKomoran && (c == '\n' || c == '\r')) {
|
if (c < 0x20 || (c > 0x7e && c < 0xa0)) {
|
||||||
// Komoran does not like some control chars (initially
|
|
||||||
// thought only formfeed, but not), which is a prob
|
|
||||||
// for pdf pages counts. will need to fix this
|
|
||||||
inputdata += ' ';
|
inputdata += ' ';
|
||||||
} else {
|
} else {
|
||||||
it.appendchartostring(inputdata);
|
it.appendchartostring(inputdata);
|
||||||
@ -175,9 +173,10 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
vector<string> tags;
|
vector<string> tags;
|
||||||
stringToTokens(outtags, tags, sepchars);
|
stringToTokens(outtags, tags, sepchars);
|
||||||
|
|
||||||
// This is the position in the whole text, not the local fragment,
|
// This is the position in the local fragment,
|
||||||
// which is bytepos-orgbytepos
|
// not in the whole text which is orgbytepos + bytepos
|
||||||
string::size_type bytepos(orgbytepos);
|
string::size_type bytepos{0};
|
||||||
|
string::size_type pagefix{0};
|
||||||
for (unsigned int i = 0; i < words.size(); i++) {
|
for (unsigned int i = 0; i < words.size(); i++) {
|
||||||
// The POS tagger strips characters from the input (e.g. multiple
|
// The POS tagger strips characters from the input (e.g. multiple
|
||||||
// spaces, sometimes new lines, possibly other stuff). This
|
// spaces, sometimes new lines, possibly other stuff). This
|
||||||
@ -190,25 +189,32 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
string word = words[i];
|
string word = words[i];
|
||||||
trimstring(word);
|
trimstring(word);
|
||||||
if (word == magicpage) {
|
if (word == magicpage) {
|
||||||
|
LOGDEB1("ko_to_words: NEWPAGE\n");
|
||||||
newpage(m_wordpos);
|
newpage(m_wordpos);
|
||||||
|
bytepos += word.size() + 1;
|
||||||
|
pagefix += word.size();
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
string::size_type newpos = bytepos - orgbytepos;
|
// Find the actual start position of the word in the section.
|
||||||
newpos = inputdata.find(word, newpos);
|
string::size_type newpos = inputdata.find(word, bytepos);
|
||||||
if (newpos != string::npos) {
|
if (newpos != string::npos) {
|
||||||
bytepos = orgbytepos + newpos;
|
bytepos = newpos;
|
||||||
|
} else {
|
||||||
|
LOGDEB("textsplitko: word [" << word << "] not found in text\n");
|
||||||
}
|
}
|
||||||
LOGDEB1("WORD OPOS " << bytepos-orgbytepos <<
|
LOGDEB1("WORD [" << word << "] size " << word.size() <<
|
||||||
" FOUND POS " << newpos << endl);
|
" TAG " << tags[i] << " inputdata size " << inputdata.size() <<
|
||||||
|
" absbytepos " << orgbytepos + bytepos <<
|
||||||
|
" bytepos " << bytepos << " word from text: " <<
|
||||||
|
inputdata.substr(bytepos, word.size()) << endl);
|
||||||
if (tags[i] == "Noun" || tags[i] == "Verb" ||
|
if (tags[i] == "Noun" || tags[i] == "Verb" ||
|
||||||
tags[i] == "Adjective" || tags[i] == "Adverb") {
|
tags[i] == "Adjective" || tags[i] == "Adverb") {
|
||||||
if (!takeword(
|
string::size_type abspos = orgbytepos + bytepos - pagefix;
|
||||||
word, m_wordpos++, bytepos, bytepos + words[i].size())) {
|
if (!takeword(word, m_wordpos++, abspos, abspos + word.size())) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOGDEB1("WORD [" << words[i] << "] size " << words[i].size() <<
|
bytepos += word.size();
|
||||||
" TAG " << tags[i] << endl);
|
|
||||||
bytepos += words[i].size();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if DO_CHECK_THINGS
|
#if DO_CHECK_THINGS
|
||||||
@ -229,5 +235,6 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
clearsplitstate();
|
clearsplitstate();
|
||||||
m_spanpos = m_wordpos = pos;
|
m_spanpos = m_wordpos = pos;
|
||||||
*cp = c;
|
*cp = c;
|
||||||
|
LOGDEB1("ko_to_words: returning\n");
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -106,13 +106,14 @@ struct MatchFragment {
|
|||||||
class TextSplitABS : public TextSplit {
|
class TextSplitABS : public TextSplit {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
TextSplitABS(const vector<string>& matchTerms,
|
TextSplitABS(const string& rawtext, const vector<string>& matchTerms,
|
||||||
const HighlightData& hdata,
|
const HighlightData& hdata,
|
||||||
unordered_map<string, double>& wordcoefs,
|
unordered_map<string, double>& wordcoefs,
|
||||||
unsigned int ctxwords,
|
unsigned int ctxwords,
|
||||||
Flags flags,
|
Flags flags,
|
||||||
unsigned int maxterms)
|
unsigned int maxterms)
|
||||||
: TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
|
: TextSplit(flags), m_rawtext(rawtext),
|
||||||
|
m_terms(matchTerms.begin(), matchTerms.end()),
|
||||||
m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords),
|
m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords),
|
||||||
maxtermcount(maxterms) {
|
maxtermcount(maxterms) {
|
||||||
|
|
||||||
@ -132,7 +133,7 @@ public:
|
|||||||
// Accept a word and its position. If the word is a matched term,
|
// Accept a word and its position. If the word is a matched term,
|
||||||
// add/update fragment definition.
|
// add/update fragment definition.
|
||||||
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
||||||
LOGDEB2("takeword: " << term << endl);
|
LOGDEB1("takeword: [" << term << "] bytepos: "<<bts<<":"<<bte<<endl);
|
||||||
// Limit time taken with monster documents. The resulting
|
// Limit time taken with monster documents. The resulting
|
||||||
// abstract will be incorrect or inexistant, but this is
|
// abstract will be incorrect or inexistant, but this is
|
||||||
// better than taking forever (the default cutoff value comes
|
// better than taking forever (the default cutoff value comes
|
||||||
@ -169,9 +170,9 @@ public:
|
|||||||
|
|
||||||
if (m_terms.find(dumb) != m_terms.end()) {
|
if (m_terms.find(dumb) != m_terms.end()) {
|
||||||
// This word is a search term. Extend or create fragment
|
// This word is a search term. Extend or create fragment
|
||||||
LOGDEB2("match: [" << dumb << "] current: " << m_curfrag.first <<
|
LOGDEB1("match: [" << dumb << "] pos " << pos << " bpos " << bts <<
|
||||||
", " << m_curfrag.second << " remain " <<
|
":" << bte << " remainingWords " << m_remainingWords << endl);
|
||||||
m_remainingWords << endl);
|
LOGDEB1("Match text " << m_rawtext.substr(bts, bte - bts) << endl);
|
||||||
double coef = m_wordcoefs[dumb];
|
double coef = m_wordcoefs[dumb];
|
||||||
if (!m_remainingWords) {
|
if (!m_remainingWords) {
|
||||||
// No current fragment. Start one
|
// No current fragment. Start one
|
||||||
@ -219,7 +220,7 @@ public:
|
|||||||
// Term group (phrase/near) handling
|
// Term group (phrase/near) handling
|
||||||
m_plists[dumb].push_back(pos);
|
m_plists[dumb].push_back(pos);
|
||||||
m_gpostobytes[pos] = pair<int,int>(bts, bte);
|
m_gpostobytes[pos] = pair<int,int>(bts, bte);
|
||||||
LOGDEB2("Recorded bpos for " << pos << ": " << bts << " " <<
|
LOGDEB1("Recorded bpos for pos " << pos << ": " << bts << " " <<
|
||||||
bte << "\n");
|
bte << "\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -236,6 +237,11 @@ public:
|
|||||||
m_remainingWords--;
|
m_remainingWords--;
|
||||||
m_curfrag.second = bte;
|
m_curfrag.second = bte;
|
||||||
if (m_remainingWords == 0) {
|
if (m_remainingWords == 0) {
|
||||||
|
LOGDEB1("FRAGMENT: from byte " << m_curfrag.first <<
|
||||||
|
" to byte " << m_curfrag.second << endl);
|
||||||
|
LOGDEB1("FRAGMENT TEXT [" << m_rawtext.substr(
|
||||||
|
m_curfrag.first, m_curfrag.second-m_curfrag.first)
|
||||||
|
<< "]\n");
|
||||||
// We used to not push weak fragments if we had a lot
|
// We used to not push weak fragments if we had a lot
|
||||||
// already. This can cause problems if the fragments
|
// already. This can cause problems if the fragments
|
||||||
// we drop are actually group fragments (which have
|
// we drop are actually group fragments (which have
|
||||||
@ -337,6 +343,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
const string& m_rawtext;
|
||||||
// Past terms because we need to go back for context before a hit
|
// Past terms because we need to go back for context before a hit
|
||||||
deque<pair<int,int>> m_prevterms;
|
deque<pair<int,int>> m_prevterms;
|
||||||
// Data about the fragment we are building
|
// Data about the fragment we are building
|
||||||
@ -424,7 +431,7 @@ int Query::Native::abstractFromText(
|
|||||||
}
|
}
|
||||||
LOGABS("abstractFromText: getterms: " << chron.millis() << "mS\n");
|
LOGABS("abstractFromText: getterms: " << chron.millis() << "mS\n");
|
||||||
|
|
||||||
TextSplitABS splitter(matchTerms, hld, wordcoefs, ctxwords,
|
TextSplitABS splitter(rawtext, matchTerms, hld, wordcoefs, ctxwords,
|
||||||
TextSplit::TXTS_ONLYSPANS,
|
TextSplit::TXTS_ONLYSPANS,
|
||||||
m_q->m_snipMaxPosWalk);
|
m_q->m_snipMaxPosWalk);
|
||||||
splitter.text_to_words(rawtext);
|
splitter.text_to_words(rawtext);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user