web page indexing doc and indents
This commit is contained in:
parent
f57530e2a6
commit
5c7d0ff96d
@ -37,15 +37,15 @@ WebStore::WebStore(RclConfig *cnf)
|
||||
int maxmbs = 40;
|
||||
cnf->getConfParam("webcachemaxmbs", &maxmbs);
|
||||
if ((m_cache = new CirCache(ccdir)) == 0) {
|
||||
LOGERR("WebStore: cant create CirCache object\n" );
|
||||
return;
|
||||
LOGERR("WebStore: cant create CirCache object\n" );
|
||||
return;
|
||||
}
|
||||
if (!m_cache->create(int64_t(maxmbs)*1000*1024, CirCache::CC_CRUNIQUE)) {
|
||||
LOGERR("WebStore: cache file creation failed: " <<
|
||||
LOGERR("WebStore: cache file creation failed: " <<
|
||||
m_cache->getReason() << "\n");
|
||||
delete m_cache;
|
||||
m_cache = 0;
|
||||
return;
|
||||
delete m_cache;
|
||||
m_cache = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@ -57,17 +57,17 @@ WebStore::~WebStore()
|
||||
// Read document from cache. Return the metadata as an Rcl::Doc
|
||||
// @param htt Web Hit Type
|
||||
bool WebStore::getFromCache(const string& udi, Rcl::Doc &dotdoc,
|
||||
string& data, string *htt)
|
||||
string& data, string *htt)
|
||||
{
|
||||
string dict;
|
||||
|
||||
if (m_cache == 0) {
|
||||
LOGERR("WebStore::getFromCache: cache is null\n");
|
||||
return false;
|
||||
LOGERR("WebStore::getFromCache: cache is null\n");
|
||||
return false;
|
||||
}
|
||||
if (!m_cache->get(udi, dict, &data)) {
|
||||
LOGDEB("WebStore::getFromCache: get failed\n");
|
||||
return false;
|
||||
LOGDEB("WebStore::getFromCache: get failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
ConfSimple cf(dict, 1);
|
||||
@ -89,4 +89,3 @@ bool WebStore::getFromCache(const string& udi, Rcl::Doc &dotdoc,
|
||||
dotdoc.meta[Rcl::Doc::keyudi] = udi;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -21,8 +21,8 @@
|
||||
|
||||
class RclConfig;
|
||||
namespace Rcl {
|
||||
class Db;
|
||||
class Doc;
|
||||
class Db;
|
||||
class Doc;
|
||||
}
|
||||
class CirCache;
|
||||
|
||||
|
||||
@ -140,9 +140,9 @@ alink="#0000FF">
|
||||
"#RCL.INDEXING.REMOVABLE">Removable
|
||||
volumes</a></span></dt>
|
||||
<dt><span class="sect1">2.5. <a href=
|
||||
"#RCL.INDEXING.WEBQUEUE"><span class=
|
||||
"#RCL.INDEXING.WebQUEUE"><span class=
|
||||
"application">Unix</span>-like systems: indexing
|
||||
visited WEB pages</a></span></dt>
|
||||
visited Web pages</a></span></dt>
|
||||
<dt><span class="sect1">2.6. <a href=
|
||||
"#RCL.INDEXING.EXTATTR"><span class=
|
||||
"application">Unix</span>-like systems: using extended
|
||||
@ -423,7 +423,7 @@ alink="#0000FF">
|
||||
<div class="list-of-tables">
|
||||
<p><b>List of Tables</b></p>
|
||||
<dl>
|
||||
<dt>3.1. <a href="#idm1437">Keyboard shortcuts</a></dt>
|
||||
<dt>3.1. <a href="#idm1438">Keyboard shortcuts</a></dt>
|
||||
</dl>
|
||||
</div>
|
||||
<div class="chapter">
|
||||
@ -720,7 +720,7 @@ alink="#0000FF">
|
||||
<li class="listitem">
|
||||
<p>A <a class="ulink" href=
|
||||
"https://framagit.org/medoc92/recollwebui" target=
|
||||
"_top">WEB interface</a>.</p>
|
||||
"_top">Web interface</a>.</p>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
@ -1949,10 +1949,10 @@ recollindex -c "$confdir"
|
||||
<div>
|
||||
<div>
|
||||
<h2 class="title" style="clear: both"><a name=
|
||||
"RCL.INDEXING.WEBQUEUE" id=
|
||||
"RCL.INDEXING.WEBQUEUE"></a>2.5. <span class=
|
||||
"RCL.INDEXING.WebQUEUE" id=
|
||||
"RCL.INDEXING.WebQUEUE"></a>2.5. <span class=
|
||||
"application">Unix</span>-like systems: indexing
|
||||
visited WEB pages</h2>
|
||||
visited Web pages</h2>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@ -1964,57 +1964,48 @@ recollindex -c "$confdir"
|
||||
"application">Beagle</span> indexer, then adapted to
|
||||
<span class="application">Recoll</span> and the
|
||||
<span class="application">Firefox</span> <span class=
|
||||
"application">XUL</span> API. A new version of the addon
|
||||
has been written to work with the <span class=
|
||||
"application">WebExtensions</span> API, which is the only
|
||||
one supported after <span class=
|
||||
"application">Firefox</span> version 57.</p>
|
||||
<p>The extension works by copying visited WEB pages to an
|
||||
"application">XUL</span> API. The current version of the
|
||||
extension is located in the <a class="ulink" href=
|
||||
"https://addons.mozilla.org/en-US/firefox/addon/recoll-we/"
|
||||
target="_top">Mozilla add-ons repository</a>, uses the
|
||||
<span class="application">WebExtensions</span> API, and
|
||||
works with current <span class="application">Firefox</span>
|
||||
versions.</p>
|
||||
<p>The extension works by copying visited Web pages to an
|
||||
indexing queue directory, which <span class=
|
||||
"application">Recoll</span> then processes, indexing the
|
||||
data, storing it into a local cache, then removing the file
|
||||
from the queue.</p>
|
||||
<p>Because the WebExtensions API introduces more
|
||||
constraints to what extensions can do, the new version
|
||||
works with one more step: the files are first created in
|
||||
the browser default downloads location (typically
|
||||
<code class="filename">$HOME/Downloads</code> ), then moved
|
||||
by a script in the old queue location. The script is
|
||||
automatically executed by the <span class=
|
||||
"application">Recoll</span> indexer versions 1.23.5 and
|
||||
newer. It could conceivably be executed independently to
|
||||
make the new browser extension compatible with an older
|
||||
<span class="application">Recoll</span> version (the script
|
||||
is named <span class=
|
||||
"command"><strong>recoll-we-move-files.py</strong></span>).</p>
|
||||
<div class="note" style=
|
||||
"margin-left: 0.5in; margin-right: 0.5in;">
|
||||
<h3 class="title">Note</h3>
|
||||
<p>For the WebExtensions-based version to work, it is
|
||||
necessary to set the <code class=
|
||||
"literal">webdownloadsdir</code> value in the
|
||||
configuration if it was changed from the default
|
||||
<code class="filename">$HOME/Downloads</code> in the
|
||||
browser preferences.</p>
|
||||
</div>
|
||||
<p>The visited WEB pages indexing feature can be enabled on
|
||||
"application">Recoll</span> then processes, storing the
|
||||
data into a local cache, then indexing it, then removing
|
||||
the file from the queue.</p>
|
||||
<p>The visited Web pages indexing feature can be enabled on
|
||||
the <span class="application">Recoll</span> side from the
|
||||
GUI <span class="guilabel">Index configuration</span>
|
||||
panel, or by editing the configuration file (set
|
||||
<code class="varname">processwebqueue</code> to 1).</p>
|
||||
<p>A current pointer to the extension can be found, along
|
||||
with up-to-date instructions, on the <a class="ulink" href=
|
||||
<p>The <span class="application">Recoll</span> GUI has a
|
||||
tool to list and edit the contents of the Web cache.
|
||||
(<span class="guimenu">Tools</span> → <span class=
|
||||
"guimenuitem">Webcache editor</span>)</p>
|
||||
<p>You can find more details on Web indexing, its usage and
|
||||
configuration in a <a class="ulink" href=
|
||||
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/IndexWebHistory"
|
||||
target="_top">Recoll wiki</a>.</p>
|
||||
<p>A copy of the indexed WEB pages is retained by Recoll in
|
||||
a local cache (from which previews can be fetched). The
|
||||
cache size can be adjusted from the <span class=
|
||||
"guilabel">Index configuration</span> / <span class=
|
||||
"guilabel">Web history</span> panel. Once the maximum size
|
||||
is reached, old pages are purged - both from the cache and
|
||||
the index - to make room for new ones, so you need to
|
||||
explicitly archive in some other place the pages that you
|
||||
want to keep indefinitely.</p>
|
||||
target="_top">Recoll 'Howto' entry</a>.</p>
|
||||
<div class="note" style=
|
||||
"margin-left: 0.5in; margin-right: 0.5in;">
|
||||
<h3 class="title">The cache is not an archive</h3>
|
||||
<p>A copy of the indexed Web pages is retained by Recoll
|
||||
in a local cache (from which data is fetched for
|
||||
previews, or when resetting the index). The cache has a
|
||||
maximum size, which can be adjusted from the <span class=
|
||||
"guilabel">Index configuration</span> / <span class=
|
||||
"guilabel">Web history</span> panel (<code class=
|
||||
"literal">webcachemaxmbs</code> parameter in <code class=
|
||||
"filename">recoll.conf</code>). Once the maximum size is
|
||||
reached, old pages are erased to make room for new ones.
|
||||
The pages which you want to keep indefinitely need to be
|
||||
explicitly archived elsewhere. Using a very high value
|
||||
for the cache size can avoid data erasure, but see the
|
||||
above 'Howto' page for more details and gotchas.</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="sect1">
|
||||
<div class="titlepage">
|
||||
@ -3473,14 +3464,14 @@ fs.inotify.max_user_watches=32768
|
||||
be able to find under <code class=
|
||||
"filename">/usr/share/recoll/examples/fragbuts.xml</code>),
|
||||
contains an example which filters the results from the
|
||||
WEB history.</p>
|
||||
Web history.</p>
|
||||
<p>Here follows an example:</p>
|
||||
<pre class="programlisting">
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<fragbuts version="1.0">
|
||||
|
||||
<radiobuttons>
|
||||
<!-- Actually useful: toggle WEB queue results inclusion -->
|
||||
<!-- Actually useful: toggle Web queue results inclusion -->
|
||||
<fragbut>
|
||||
<label>Include Web Results</label>
|
||||
<frag></frag>
|
||||
@ -3996,7 +3987,7 @@ fs.inotify.max_user_watches=32768
|
||||
given context (e.g. within a preview window, within the
|
||||
result table).</p>
|
||||
<div class="table">
|
||||
<a name="idm1437" id="idm1437"></a>
|
||||
<a name="idm1438" id="idm1438"></a>
|
||||
<p class="title"><b>Table 3.1. Keyboard
|
||||
shortcuts</b></p>
|
||||
<div class="table-contents">
|
||||
@ -7940,11 +7931,11 @@ hasextract = False
|
||||
"application">FreeBSD</span> ports, etc.), or from some
|
||||
type of "backports" repository providing versions newer
|
||||
than the standard ones, or found on the <span class=
|
||||
"application">Recoll</span> WEB site in some cases. The
|
||||
"application">Recoll</span> Web site in some cases. The
|
||||
most up-to-date information about Recoll packages can
|
||||
usually be found on the <a class="ulink" href=
|
||||
"http://www.recoll.org/pages/download.html" target=
|
||||
"_top"><span class="application">Recoll</span> WEB site
|
||||
"_top"><span class="application">Recoll</span> Web site
|
||||
downloads page</a></p>
|
||||
<p>The <span class="application">Windows</span> version of
|
||||
Recoll comes in a self-contained setup file, there is
|
||||
|
||||
@ -282,7 +282,7 @@
|
||||
<ulink url="https://www.lesbonscomptes.com/recoll/pages/download.html">Search Provider</ulink>.
|
||||
</para></listitem>
|
||||
<listitem><para>A
|
||||
<ulink url="https://framagit.org/medoc92/recollwebui">WEB interface</ulink>.
|
||||
<ulink url="https://framagit.org/medoc92/recollwebui">Web interface</ulink>.
|
||||
</para></listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
@ -1257,56 +1257,51 @@ recollindex -c "$confdir"
|
||||
</simplesect>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="RCL.INDEXING.WEBQUEUE">
|
||||
<title>&LIN;: indexing visited WEB pages</title>
|
||||
<sect1 id="RCL.INDEXING.WebQUEUE">
|
||||
<title>&LIN;: indexing visited Web pages</title>
|
||||
|
||||
<para>With the help of a <application>Firefox</application>
|
||||
extension, &RCL; can index the Internet pages that you visit. The
|
||||
extension has a long history: it was initially designed for the
|
||||
<application>Beagle</application> indexer, then adapted to &RCL; and
|
||||
the <application>Firefox</application> <application>XUL</application>
|
||||
API. A new version of the addon has been written to work with the
|
||||
<application>WebExtensions</application> API, which is the only one
|
||||
supported after <application>Firefox</application> version 57.</para>
|
||||
extension, &RCL; can index the Internet pages that you visit. The
|
||||
extension has a long history: it was initially designed for
|
||||
the <application>Beagle</application> indexer, then adapted to
|
||||
&RCL; and
|
||||
the <application>Firefox</application> <application>XUL</application>
|
||||
API. The current version of the extension is located in
|
||||
the <ulink url="https://addons.mozilla.org/en-US/firefox/addon/recoll-we/">Mozilla
|
||||
add-ons repository</ulink>, uses
|
||||
the <application>WebExtensions</application> API, and works with
|
||||
current <application>Firefox</application> versions.</para>
|
||||
|
||||
<para>The extension works by copying visited WEB pages to an indexing
|
||||
queue directory, which &RCL; then processes, indexing the data,
|
||||
storing it into a local cache, then removing the file from the
|
||||
queue.</para>
|
||||
|
||||
<para>Because the WebExtensions API introduces more constraints to
|
||||
what extensions can do, the new version works with one
|
||||
more step: the files are first created in the browser default
|
||||
downloads location (typically <filename>$HOME/Downloads</filename> ),
|
||||
then moved by a script in the old queue location. The script is
|
||||
automatically executed by the &RCL; indexer versions 1.23.5 and
|
||||
newer. It could conceivably be executed independently to make the new
|
||||
browser extension compatible with an older &RCL; version (the script
|
||||
is named <command>recoll-we-move-files.py</command>).</para>
|
||||
|
||||
<note><para>For the WebExtensions-based version to work, it is
|
||||
necessary to set the <literal>webdownloadsdir</literal> value in the
|
||||
configuration if it was changed from the default
|
||||
<filename>$HOME/Downloads</filename> in the browser
|
||||
preferences.</para></note>
|
||||
<para>The extension works by copying visited Web pages to an indexing
|
||||
queue directory, which &RCL; then processes, storing the data into a
|
||||
local cache, then indexing it, then removing the file from the
|
||||
queue.</para>
|
||||
|
||||
<para>The visited WEB pages indexing feature can be enabled on the
|
||||
&RCL; side from the GUI <guilabel>Index configuration</guilabel>
|
||||
panel, or by editing the configuration file (set
|
||||
<varname>processwebqueue</varname> to 1).</para>
|
||||
<para>The visited Web pages indexing feature can be enabled on the
|
||||
&RCL; side from the GUI <guilabel>Index configuration</guilabel>
|
||||
panel, or by editing the configuration file (set
|
||||
<varname>processwebqueue</varname> to 1).</para>
|
||||
|
||||
<para>A current pointer to the extension can be found, along with
|
||||
up-to-date instructions, on the
|
||||
<ulink url="&FAQS;IndexWebHistory">Recoll wiki</ulink>.</para>
|
||||
<para>The &RCL; GUI has a tool to list and edit the contents of the
|
||||
Web
|
||||
cache. (<menuchoice><guimenu>Tools</guimenu><guimenuitem>Webcache
|
||||
editor</guimenuitem></menuchoice>)</para>
|
||||
|
||||
<para>A copy of the indexed WEB pages is retained by Recoll in a
|
||||
local cache (from which previews can be fetched). The cache size can
|
||||
be adjusted from the <guilabel>Index configuration</guilabel> /
|
||||
<guilabel>Web history</guilabel> panel. Once the maximum size
|
||||
is reached, old pages are purged - both from the cache and the index
|
||||
- to make room for new ones, so you need to explicitly archive in
|
||||
some other place the pages that you want to keep
|
||||
indefinitely.</para>
|
||||
<para>You can find more details on Web indexing, its usage and configuration
|
||||
in a <ulink url="&FAQS;IndexWebHistory">Recoll 'Howto' entry</ulink>.</para>
|
||||
|
||||
<note><title>The cache is not an archive</title><para>A copy of
|
||||
the indexed Web pages is retained by Recoll in a local cache
|
||||
(from which data is fetched for previews, or when resetting the
|
||||
index). The cache has a maximum size, which can be adjusted from
|
||||
the <guilabel>Index configuration</guilabel> / <guilabel>Web
|
||||
history</guilabel> panel (<literal>webcachemaxmbs</literal>
|
||||
parameter in <filename>recoll.conf</filename>). Once the maximum
|
||||
size is reached, old pages are erased to make room for new ones.
|
||||
The pages which you want to keep indefinitely need to be
|
||||
explicitly archived elsewhere. Using a very high value for
|
||||
the cache size can avoid data erasure, but see the above 'Howto'
|
||||
page for more details and gotchas.</para></note>
|
||||
|
||||
</sect1>
|
||||
|
||||
@ -2475,7 +2470,7 @@ fs.inotify.max_user_watches=32768
|
||||
file inside the configuration directory. The sample file
|
||||
distributed with &RCL; (which you should be able to find under
|
||||
<filename>/usr/share/recoll/examples/fragbuts.xml</filename>),
|
||||
contains an example which filters the results from the WEB
|
||||
contains an example which filters the results from the Web
|
||||
history.</para>
|
||||
|
||||
|
||||
@ -2485,7 +2480,7 @@ fs.inotify.max_user_watches=32768
|
||||
<fragbuts version="1.0">
|
||||
|
||||
<radiobuttons>
|
||||
<!-- Actually useful: toggle WEB queue results inclusion -->
|
||||
<!-- Actually useful: toggle Web queue results inclusion -->
|
||||
<fragbut>
|
||||
<label>Include Web Results</label>
|
||||
<frag></frag>
|
||||
@ -6115,11 +6110,11 @@ hasextract = False
|
||||
<application>Debian/Ubuntu apt</application>,
|
||||
<application>FreeBSD</application> ports, etc.), or from some type
|
||||
of "backports" repository providing versions newer than the standard
|
||||
ones, or found on the &RCL; WEB site in some
|
||||
ones, or found on the &RCL; Web site in some
|
||||
cases. The most up-to-date information about Recoll packages can
|
||||
usually be found on the
|
||||
<ulink url="http://www.recoll.org/pages/download.html">
|
||||
<application>Recoll</application> WEB site downloads
|
||||
<application>Recoll</application> Web site downloads
|
||||
page</ulink></para>
|
||||
|
||||
<para>The &WIN; version of Recoll comes in a self-contained setup
|
||||
|
||||
@ -331,7 +331,7 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
|
||||
{
|
||||
ConfParamW *bparam = m_w->addParam(
|
||||
idx, ConfTabsW::CFPT_BOOL, "processwebqueue",
|
||||
tr("Process the WEB history queue"),
|
||||
tr("Process the Web history queue"),
|
||||
tr("Enables indexing Firefox visited pages.<br>"
|
||||
"(you need also install the Firefox Recoll plugin)"));
|
||||
ConfParamW *cparam = m_w->addParam(
|
||||
@ -353,6 +353,8 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
|
||||
"file (only waste space at the end)."
|
||||
), -1, 1000*1000); // Max 1TB...
|
||||
m_w->enableLink(bparam, cparam);
|
||||
m_w->addBlurb(idx, tr("Note: old pages will be erased to make space for "
|
||||
"new ones when the maximum size is reached"));
|
||||
m_w->endOfList(idx);
|
||||
return true;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user