From f778274b42e76a4601ed173704bd7d84d09d8faf Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 19 Jan 2018 13:17:15 +0100 Subject: [PATCH 1/4] Fix Windows PDF indexing. The successful test for poppler/pdftotext was not acknowledged and pdf indexing always failed --- src/filters/rclpdf.py | 83 ++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 56 deletions(-) diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index 603e1d30..1e6852ea 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -86,9 +86,10 @@ class PDFExtractor: self.pdftotext = rclexecm.which("pdftotext") if not self.pdftotext: self.pdftotext = rclexecm.which("poppler/pdftotext") - # No need for anything else. openfile() will return an - # error at once - return + if not self.pdftotext: + # No need for anything else. openfile() will return an + # error at once + return cf = rclconfig.RclConfig() self.confdir = cf.getConfDir() @@ -98,7 +99,6 @@ class PDFExtractor: # (xmltag,rcltag) pairs self.extrameta = cf.getConfParam("pdfextrameta") if self.extrameta: - self.extrametafix = cf.getConfParam("pdfextrametafix") self._initextrameta() # Check if we need to escape portions of text where old @@ -179,16 +179,7 @@ class PDFExtractor: self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' + r'(.*)' + r'<\?xpacket[ ]+end', flags = re.DOTALL) - global EMF - EMF = None - if self.extrametafix: - try: - import imp - EMF = imp.load_source('pdfextrametafix', self.extrametafix) - except Exception as err: - self.em.rclog("Import extrametafix failed: %s" % err) - pass - + # Extract all attachments if any into temporary directory def extractAttach(self): if self.attextractdone: @@ -366,17 +357,17 @@ class PDFExtractor: return output, isempty def _metatag(self, nm, val): - return b"" # metaheaders is a list of (nm, value) pairs def _injectmeta(self, html, metaheaders): - metatxt = b'' + metatxt = '' for nm, val in metaheaders: - metatxt += self._metatag(nm, val) + b'\n' + metatxt += self._metatag(nm, val) + '\n' if not metatxt: return html - res = self.re_head.sub(b'\n' + metatxt, html) + res = self.re_head.sub('\n' + metatxt, html) #self.em.rclog("Substituted html: [%s]"%res) if res: return res @@ -392,38 +383,30 @@ class PDFExtractor: return text.strip() # or: return reduce((lambda t,p : t+p+' '), # [e.text for e in elt.iter() if e.text]).strip() - - + def _setextrameta(self, html): if not self.pdfinfo: - return html + return - emf = EMF.MetaFixer() if EMF else None - - # Execute pdfinfo and extract the XML packet all = subprocess.check_output([self.pdfinfo, "-meta", self.filename]) + + # Extract the XML packet res = self.re_xmlpacket.search(all) - xml = res.group(1) if res else '' - #self.em.rclog("extrameta: XML: [%s]" % xml) + xml = '' + if res: + xml = res.group(1) + # self.em.rclog("extrameta: XML: [%s]" % xml) if not xml: return html - # Process the XML data - root = ET.fromstring(xml) - # Sometimes the root tag is , sometimes + metaheaders = [] # The namespace thing is a drag. Can't do it from the top. See # the stackoverflow ref above. Maybe we'd be better off just # walking the full tree and building the namespaces dict. - if root.tag.endswith('RDF'): - rdf = root - else: - namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"} - rdf = root.find("rdf:RDF", namespaces) - if rdf is None: - self.em.rclog("No rdf:RDF node"); - return html - - metaheaders = [] + root = ET.fromstring(xml) + #self.em.rclog("NSMAP: %s"% root.nsmap) + namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"} + rdf = root.find("rdf:RDF", namespaces) #self.em.rclog("RDF NSMAP: %s"% rdf.nsmap) rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap) #self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap) @@ -436,27 +419,15 @@ class PDFExtractor: # define the required namespace. continue if elt is not None: - text = self._xmltreetext(elt).encode('UTF-8') - if emf: - try: - text = emf.metafix(metanm, text) - except: - pass - # Should we set empty values ? + text = self._xmltreetext(elt) if text: + # Should we set empty values ? # Can't use setfield as it only works for # text/plain output at the moment. metaheaders.append((rclnm, text)) if metaheaders: - if emf: - try: - emf.wrapup(metaheaders) - except: - pass return self._injectmeta(html, metaheaders) - else: - return html - + def _selfdoc(self): '''Extract the text from the pdf doc (as opposed to attachment)''' self.em.setmimetype('text/html') @@ -465,13 +436,13 @@ class PDFExtractor: eof = rclexecm.RclExecM.eofnext else: eof = rclexecm.RclExecM.noteof - + html = subprocess.check_output([self.pdftotext, "-htmlmeta", "-enc", "UTF-8", "-eol", "unix", "-q", self.filename, "-"]) html, isempty = self._fixhtml(html) - #self.em.rclog("after _fixhtml: isempty %d html: \n%s" % (isempty, html)) + #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html)) if isempty and self.ocrpossible: html = self.ocrpdf() From 5b35ecfe365bdb088d37b5fc881221d7d3d35fc6 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 19 Jan 2018 17:26:43 +0100 Subject: [PATCH 2/4] Windows warning suppression (no real changes) --- src/common/textsplit.cpp | 2 +- src/index/exefetcher.cpp | 2 -- src/rcldb/rcldb.cpp | 2 ++ src/utils/smallut.h | 2 ++ 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 28f91c13..ae85127a 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -492,7 +492,7 @@ static inline bool isdigit(int what, unsigned int flgs) #define STATS_INC_WORDCHARS #endif -vector splitFlags = { +vector splitFlags{ {TextSplit::TXTS_NOSPANS, "nospans"}, {TextSplit::TXTS_ONLYSPANS, "onlyspans"}, {TextSplit::TXTS_KEEPWILD, "keepwild"} diff --git a/src/index/exefetcher.cpp b/src/index/exefetcher.cpp index ab6feb4c..8dda0208 100644 --- a/src/index/exefetcher.cpp +++ b/src/index/exefetcher.cpp @@ -79,8 +79,6 @@ bool EXEDocFetcher::makesig(RclConfig* cnf, const Rcl::Doc& idoc, string& sig) // Lookup bckid in the config and create an appropriate fetcher. EXEDocFetcher *exeDocFetcherMake(RclConfig *config, const string& bckid) { - EXEDocFetcher *fetcher = 0; - // The config we only read once, not gonna change. static ConfSimple *bconf; if (!bconf) { diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 4f35516c..208775cf 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -753,7 +753,9 @@ Db::~Db() LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " << m_ndb->m_iswritable << "\n"); i_close(true); +#ifdef RCL_USE_ASPELL delete m_aspell; +#endif delete m_config; } diff --git a/src/utils/smallut.h b/src/utils/smallut.h index d159f1f7..1ab958c0 100644 --- a/src/utils/smallut.h +++ b/src/utils/smallut.h @@ -227,6 +227,8 @@ private: /// Entries for the descriptive table struct CharFlags { + CharFlags(int v, const char *y, const char *n=0) + : value(v), yesname(y), noname(n) {} unsigned int value; // Flag or value const char *yesname;// String to print if flag set or equal const char *noname; // String to print if flag not set (unused for values) From fd328722184b4bf5e43668f0ee588f2b242beb65 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 20 Jan 2018 11:59:00 +0100 Subject: [PATCH 3/4] Improve 'rebuild index' under Windows: this often failed because of some open files in the Xapian db (could not be deleted under windows). Now only fails if a preview has been opened, and a GUI restart fixes the situation. --- src/qtgui/rclm_idx.cpp | 31 ++++++++++++++++++++++++++++--- src/qtgui/rclm_wins.cpp | 15 +++++++++------ src/qtgui/rclmain_w.cpp | 6 +++--- src/qtgui/rclmain_w.h | 2 ++ src/qtgui/reslist.cpp | 2 ++ src/rcldb/rcldb.cpp | 10 ++++++++++ 6 files changed, 54 insertions(+), 12 deletions(-) diff --git a/src/qtgui/rclm_idx.cpp b/src/qtgui/rclm_idx.cpp index c5fe9186..be186dda 100644 --- a/src/qtgui/rclm_idx.cpp +++ b/src/qtgui/rclm_idx.cpp @@ -29,6 +29,7 @@ #include "rclmain_w.h" #include "specialindex.h" #include "readfile.h" +#include "snippets_w.h" using namespace std; @@ -291,8 +292,20 @@ void RclMain::toggleIndexing() } } +static void delay(int millisecondsWait) +{ + QEventLoop loop; + QTimer t; + t.connect(&t, &QTimer::timeout, &loop, &QEventLoop::quit); + t.start(millisecondsWait); + loop.exec(); +} + void RclMain::rebuildIndex() { + if (m_indexerState == IXST_UNKNOWN) { + delay(1500); + } switch (m_indexerState) { case IXST_UNKNOWN: case IXST_RUNNINGMINE: @@ -313,12 +326,24 @@ void RclMain::rebuildIndex() QMessageBox::NoButton); if (rep == QMessageBox::Ok) { #ifdef _WIN32 - // Under windows, it's necessary to close the db here, + // Under windows, it is necessary to close the db here, // else Xapian won't be able to do what it wants with the // (open) files. Of course if there are several GUI - // instances, this won't work... - if (rcldb) + // instances, this won't work... Also it's quite difficult + // to make sure that there are no more references to the + // db because, for example of the Enquire objects inside + // Query inside Docsource etc. + // + // !! At this moment, this does not work if a preview has + // !! been opened. Could not find the reason (mysterious + // !! Xapian::Database reference somewhere?). The indexing + // !! fails, leaving a partial index directory. Then need + // !! to restart the GUI to succeed in reindexing. + if (rcldb) { + resetSearch(); + deleteZ(m_snippets); rcldb->close(); + } #endif // _WIN32 // Could also mean that no helpers are missing, but then we // won't try to show a message anyway (which is what diff --git a/src/qtgui/rclm_wins.cpp b/src/qtgui/rclm_wins.cpp index 9594a72c..d506d6f5 100644 --- a/src/qtgui/rclm_wins.cpp +++ b/src/qtgui/rclm_wins.cpp @@ -451,13 +451,16 @@ void RclMain::newDupsW(const Rcl::Doc, const vector dups) void RclMain::showSnippets(Rcl::Doc doc) { - SnippetsW *sp = new SnippetsW(doc, m_source); - connect(sp, SIGNAL(startNativeViewer(Rcl::Doc, int, QString)), + if (m_snippets) { + deleteZ(m_snippets); + } + m_snippets = new SnippetsW(doc, m_source); + connect(m_snippets, SIGNAL(startNativeViewer(Rcl::Doc, int, QString)), this, SLOT(startNativeViewer(Rcl::Doc, int, QString))); - connect(new QShortcut(quitKeySeq, sp), SIGNAL (activated()), + connect(new QShortcut(quitKeySeq, m_snippets), SIGNAL (activated()), this, SLOT (fileExit())); - connect(new QShortcut(closeKeySeq, sp), SIGNAL (activated()), - sp, SLOT (close())); - sp->show(); + connect(new QShortcut(closeKeySeq, m_snippets), SIGNAL (activated()), + m_snippets, SLOT (close())); + m_snippets->show(); } diff --git a/src/qtgui/rclmain_w.cpp b/src/qtgui/rclmain_w.cpp index 41438f65..f61369ec 100644 --- a/src/qtgui/rclmain_w.cpp +++ b/src/qtgui/rclmain_w.cpp @@ -531,7 +531,7 @@ void RclMain::initDbOpen() connect(fidia.idxschedCLB, SIGNAL(clicked()), this, SLOT(execIndexSched())); connect(fidia.runidxPB, SIGNAL(clicked()), - this, SLOT(toggleIndexing())); + this, SLOT(rebuildIndex())); fidia.exec(); // Don't open adv search or run cmd line search in this case. return; @@ -788,6 +788,7 @@ void RclMain::initiateQuery() void RclMain::resetSearch() { + m_source = std::shared_ptr(); emit searchReset(); } @@ -974,8 +975,7 @@ void RclMain::docExpand(Rcl::Doc doc) void RclMain::showDocHistory() { LOGDEB("RclMain::showDocHistory\n"); - emit searchReset(); - m_source = std::shared_ptr(); + resetSearch(); curPreview = 0; string reason; diff --git a/src/qtgui/rclmain_w.h b/src/qtgui/rclmain_w.h index d7add82e..1eedfc0f 100644 --- a/src/qtgui/rclmain_w.h +++ b/src/qtgui/rclmain_w.h @@ -35,6 +35,7 @@ #include "guiutils.h" #include "rclutil.h" +class SnippetsW; class IdxSchedW; class ExecCmd; class Preview; @@ -202,6 +203,7 @@ protected: private: + SnippetsW *m_snippets{0}; Preview *curPreview; AdvSearch *asearchform; UIPrefsDialog *uiprefs; diff --git a/src/qtgui/reslist.cpp b/src/qtgui/reslist.cpp index 6dd430d3..15d668cf 100644 --- a/src/qtgui/reslist.cpp +++ b/src/qtgui/reslist.cpp @@ -407,6 +407,8 @@ void ResList::setDocSource(std::shared_ptr nsource) { LOGDEB("ResList::setDocSource()\n"); m_source = std::shared_ptr(new DocSource(theconfig, nsource)); + if (m_pager) + m_pager->setDocSource(m_source); } // A query was executed, or the filtering/sorting parameters changed, diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 208775cf..0d47bf9b 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -58,6 +58,7 @@ using namespace std; #include "rclinit.h" #include "internfile.h" #include "utf8fn.h" +#include "wipedir.h" #ifdef RCL_USE_ASPELL #include "rclaspell.h" #endif @@ -793,6 +794,15 @@ bool Db::open(OpenMode mode, OpenError *error) case DbUpd: case DbTrunc: { + // Xapian is quite bad at erasing partial db which can + // occur because of open file deletion errors on + // Windows. + if (mode == DbTrunc) { + if (path_exists(path_cat(dir, "iamchert"))) { + wipedir(dir); + unlink(dir.c_str()); + } + } int action = (mode == DbUpd) ? Xapian::DB_CREATE_OR_OPEN : Xapian::DB_CREATE_OR_OVERWRITE; if (::access(dir.c_str(), 0) != 0) { From ac09a98b10e792f0f21de208d947b7e1e0ce32d3 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 20 Jan 2018 12:06:39 +0100 Subject: [PATCH 4/4] bumped version --- src/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/VERSION b/src/VERSION index 2d27ccba..82bfa5ce 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.23.7 +1.23.8