/* Copyright (C) 2017-2019 J.F.Dockes
 *
 * License: GPL 2.1
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
#ifndef _hldata_h_included_
#define _hldata_h_included_

#include <cstddef>
#include <vector>
#include <string>
#include <set>
#include <unordered_map>
#include <utility>

/** Store data about user search terms and their expansions. This is used
 * mostly for highlighting result text and walking the matches, generating
 * spelling suggestions.
 */
struct HighlightData {
    /** The user terms, excluding those with wildcards. This list is
     * intended for orthographic suggestions so the terms are always
     * lowercased, unaccented or not depending on the type of index
     * (as the spelling dictionary is generated from the index terms).
     */
    std::set<std::string> uterms;

    /** The db query terms linked to the uterms entry they were expanded from.
     * This is used for aggregating term stats when generating snippets (for
     * choosing the best terms, allocating slots, etc.)
     */
    std::unordered_map<std::string, std::string> terms;

    /** The original user terms-or-groups. This is for display
     * purposes: ie when creating a menu to look for a specific
     * matched group inside a preview window. We want to show the
     * user-entered data in the menu, not some transformation, so
     * these are always raw, diacritics and case preserved.
     */
    std::vector<std::vector<std::string> > ugroups;

    /** Processed/expanded terms and groups. Used for looking for
     * regions to highlight. A group can be a PHRASE or NEAR entry.
     * Terms are just groups with 1 entry. All
     * terms are transformed to be compatible with index content
     * (unaccented and lowercased as needed depending on
     * configuration), and the list may include values
     * expanded from the original terms by stem or wildcard expansion.
     */
    struct TermGroup {
        // We'd use a union for term/orgroups but no can do, so both
        // members are always present.
        std::string term;
        std::vector<std::vector<std::string> > orgroups;
        int slack{0};

        /* Index into ugroups. As a user term or group may generate
         * many processed/expanded terms or groups, this is how we
         * relate an expansion to its source (used, e.g. for
         * generating anchors for walking search matches in the
         * preview window).
         */
        size_t grpsugidx{0};

        enum TGK {TGK_TERM, TGK_NEAR, TGK_PHRASE};
        TGK kind{TGK_TERM};
    };
    std::vector<TermGroup> index_term_groups;

    /** Reset the object to its empty state. Every container is emptied,
     * including the terms map, so that no expanded-term -> user-term
     * link from a previous use survives the call.
     */
    void clear() {
        uterms.clear();
        terms.clear();
        ugroups.clear();
        index_term_groups.clear();
    }

    void append(const HighlightData&);

    // Print (debug)
    std::string toString() const;
};

/* The following is used by plaintorich.cpp for finding zones to
   highlight and by rclabsfromtext.cpp to choose fragments for the
   abstract */

struct GroupMatchEntry {
    // Start/End byte offsets in the document text
    std::pair<int, int> offs;
    // Index of the search group this comes from: this is to relate a
    // match to the original user input.
    size_t grpidx;

    GroupMatchEntry(int sta, int sto, size_t idx)
        : offs(sta, sto), grpidx(idx) {
    }
};

// Find NEAR or PHRASE matches for one group of terms.
//
// @param hldata User query expansion descriptor (see above). We only use
//      the index_term_groups entry
//
// @param grpidx Index in hldata.index_term_groups for the group we
//     process. This is used by us to get the terms, group type
//     (phrase/near) and slacks. We also set it in the output
//     GroupMatchEntry structures to allow the caller to link a match
//     with a specific user input (e.g. for walking the match in the
//     GUI preview)
//
// @param inplists Position lists for the group terms. This is the
//     data used to look for matches.
//
// @param gpostobytes Translation of term position to start/end byte
//     offsets. This is used to translate term positions to byte
//     positions in the output, for ease of use by caller.
//
// @param[out] tboffs Found matches. Each match has a begin and end
//     byte offset and an index linking to the origin data in the
//     HighlightData structure.
extern bool matchGroup(
    const HighlightData& hldata,
    unsigned int grpidx,
    const std::unordered_map<std::string, std::vector<int>>& inplists,
    const std::unordered_map<int, std::pair<int, int>>& gpostobytes,
    std::vector<GroupMatchEntry>& tboffs
    );

#endif /* _hldata_h_included_ */