Add an orgmodesubdocs recoll.conf parameter to switch rclorgmode between using the whole text and creating level-1 subdocs (the default is subdocs)

This commit is contained in:
Jean-Francois Dockes 2022-04-08 08:52:23 +02:00
parent 561592b618
commit f2b24cf22d
2 changed files with 37 additions and 8 deletions

View File

@ -1,19 +1,37 @@
#!/usr/bin/env python3
from __future__ import print_function
# Copyright (C) 2020-2022 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# Read an org-mode file, break it into "documents" along the separator lines
# and interface with recoll execm
'''Read an org-mode file, optionally break it into "subdocs" along level 1 headings'''
import rclexecm
import sys
import re
import rclexecm
import rclconfig
import conftree
class OrgModeExtractor:
def __init__(self, em):
    '''Set up empty extraction state and read the "orgmodesubdocs" setting.

    em is kept as self.em for later use by the handler methods.
    '''
    self.em = em
    self.file = ""
    self.contents = []
    self.docs = []
    self.selftext = ""
    # The configuration is read once, at construction time.
    # NOTE(review): what getConfParam() returns for an unset parameter is
    # not visible here -- confirm that valToBool() copes with it.
    rclconf = rclconfig.RclConfig()
    subdocsparam = rclconf.getConfParam("orgmodesubdocs")
    self.createsubdocs = conftree.valToBool(subdocsparam)
def extractone(self, index):
if index >= len(self.docs):
@ -34,7 +52,6 @@ class OrgModeExtractor:
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.file = params["filename"]
try:
data = open(self.file, "rb").read()
except Exception as e:
@ -42,6 +59,9 @@ class OrgModeExtractor:
return False
self.currentindex = -1
if not self.createsubdocs:
self.selftext = data
return True
res = rb'''^\* '''
self.docs = re.compile(res, flags=re.MULTILINE).split(data)
@ -63,6 +83,8 @@ class OrgModeExtractor:
return self.extractone(index)
def getnext(self, params):
if not self.createsubdocs:
return (True, self.selftext, "", rclexecm.RclExecM.eofnext)
if self.currentindex == -1:
# Return "self" doc

View File

@ -1061,8 +1061,15 @@ snippetMaxPosWalk = 1000000
# </var>
abbyyocrcmd = /opt/ABBYYOCR11/abbyyocr11
# <grouptitle id="SPECLOCATIONS">Parameters set for specific
# locations</grouptitle>
# <grouptitle id="MISCHANDLERPARAMS">Parameters for specific handlers</grouptitle>
# <var name="orgmodesubdocs" type="bool">
# <brief>Index org-mode level 1 sections as separate sub-documents</brief>
# <descr>This is the default. If set to false, org-mode files will be indexed as plain text</descr>
# </var>
orgmodesubdocs = 1
# <grouptitle id="SPECLOCATIONS">Parameters set for specific locations</grouptitle>
# You could specify different parameters for a subdirectory like this:
#[~/hungariandocs/plain]