recoll/src/filters/rclaudio
2019-04-13 14:23:55 +02:00

430 lines
14 KiB
Python
Executable File

#!/usr/bin/env python3
# Audio tag extractor for Recoll, using mutagen
import sys
import os
import rclexecm
from rclbasehandler import RclBaseHandler
import time
import datetime
import re
import rclconfig
try:
import mutagen
from mutagen import File
from mutagen.id3 import ID3TimeStamp
except:
print("RECFILTERROR HELPERNOTFOUND python3:mutagen")
sys.exit(1);
# Matches a leading "N, M" pair of decimal numbers, optionally preceded by
# any run of '[' / '(' characters, in a bytes value (e.g. b'[(3, 12)]').
# Group 1 is the first number, group 2 the second.
# A raw bytes literal is used so that '\s' reaches the regex engine without
# being treated as an (invalid) string escape, and the '[' inside the
# character class is escaped to avoid the "possible nested set" warning.
re_pairnum = re.compile(rb'''[\[(]*([0-9]+),\s*([0-9]+)''')
# The 'Easy' mutagen tag conversions are incomplete, so we do the mapping
# ourselves.
# TPA,TPOS,disk DISCNUMBER/TOTALDISCS
# TRCK,TRK,trkn TRACKNUMBER/TOTALTRACKS
# The conversions here are consistent with the ones in MinimServer (2019-03),
# including the rating stuff and TXXX. Lacking: iTunes '----' handling?
# Translation table from the tag names found in the various audio file
# formats to a single set of canonical upper-case names. Keys are the names
# as they appear in the files (generic upper-case names, 3- and 4-letter
# frame identifiers, and mixed-case 4-character atom names); values are the
# canonical names.
tagdict = {
    'ALBUM ARTIST': 'ALBUMARTIST',
    'ALBUM': 'ALBUM',
    'ALBUMARTIST': 'ALBUMARTIST',
    'ALBUMARTISTSORT': 'ALBUMARTISTSORT',
    'ALBUMSORT': 'ALBUMSORT',
    'ARTIST': 'ARTIST',
    'ARTISTSORT': 'ARTISTSORT',
    'BPM': 'BPM',
    'COM': 'COMMENT',
    'COMM': 'COMMENT',
    'COMMENT': 'COMMENT',
    'COMPILATION': 'COMPILATION',
    'COMPOSER': 'COMPOSER',
    'COMPOSERSORT': 'COMPOSERSORT',
    'CONDUCTOR': 'CONDUCTOR',
    'CONTENTGROUP': 'CONTENTGROUP',
    'COPYRIGHT': 'COPYRIGHT',
    'DATE': 'DATE',
    'DISCNUMBER': 'DISCNUMBER',
    'DISCSUBTITLE': 'DISCSUBTITLE',
    'DISCTOTAL': 'TOTALDISCS',
    'ENCODEDBY': 'ENCODEDBY',
    'ENSEMBLE': 'ORCHESTRA',
    'GENRE': 'GENRE',
    'ISRC': 'ISRC',
    'LABEL': 'LABEL',
    'LANGUAGE': 'LANGUAGE',
    'LYRICIST': 'LYRICIST',
    'LYRICS': 'LYRICS',
    'MOOD': 'MOOD',
    'ORCHESTRA': 'ORCHESTRA',
    'PERFORMER': 'PERFORMER',
    'POP': 'RATING1',
    'POPM': 'RATING1',
    'ORIGINALARTIST': 'ORIGINALARTIST',
    'ORIGINALDATE': 'ORIGINALDATE',
    'RELEASEDATE': 'RELEASEDATE',
    'REMIXER': 'REMIXER',
    'SUBTITLE': 'SUBTITLE',
    'TAL': 'ALBUM',
    'TALB': 'ALBUM',
    'TBP': 'BPM',
    'TBPM': 'BPM',
    'TCM': 'COMPOSER',
    'TCMP': 'COMPILATION',
    'TCO': 'GENRE',
    'TCOM': 'COMPOSER',
    'TCON': 'GENRE',
    'TCOP': 'COPYRIGHT',
    'TCP': 'COMPILATION',
    'TCR': 'COPYRIGHT',
    'TDA': 'DATE',
    'TDAT': 'DATE',
    'TDOR': 'ORIGINALDATE',
    'TDRC': 'DATE',
    'TDRL': 'RELEASEDATE',
    'TEN': 'ENCODEDBY',
    'TENC': 'ENCODEDBY',
    'TEXT': 'LYRICIST',
    'TIT1': 'CONTENTGROUP',
    'TIT2': 'TITLE',
    'TIT3': 'SUBTITLE',
    'TITLE': 'TITLE',
    'TITLESORT': 'TITLESORT',
    'TLA': 'LANGUAGE',
    'TLAN': 'LANGUAGE',
    'TMOO': 'MOOD',
    'TOA': 'ORIGINALARTIST',
    'TOPE': 'ORIGINALARTIST',
    'TOR': 'ORIGINALDATE',
    'TORY': 'ORIGINALDATE',
    'TOTALDISCS': 'TOTALDISCS',
    'TOTALTRACKS': 'TOTALTRACKS',
    'TP1': 'ARTIST',
    'TP2': 'ALBUMARTIST',
    'TP3': 'CONDUCTOR',
    'TP4': 'REMIXER',
    'TPA': 'DISCNUMBER',
    'TPB': 'LABEL',
    'TPE1': 'ARTIST',
    'TPE2': 'ALBUMARTIST',
    'TPE3': 'CONDUCTOR',
    'TPE4': 'REMIXER',
    'TPOS': 'DISCNUMBER',
    'TPUB': 'LABEL',
    'TRACK': 'TRACKNUMBER',
    'TRACKNUM': 'TRACKNUMBER',
    'TRACKNUMBER': 'TRACKNUMBER',
    'TRACKTOTAL': 'TOTALTRACKS',
    'TRC': 'ISRC',
    'TRCK': 'TRACKNUMBER',
    'TRK': 'TRACKNUMBER',
    'TS2': 'ALBUMARTISTSORT',
    'TSA': 'ALBUMSORT',
    'TSC': 'COMPOSERSORT',
    'TSO2': 'ALBUMARTISTSORT',
    'TSOA': 'ALBUMSORT',
    'TSOC': 'COMPOSERSORT',
    'TSOP': 'ARTISTSORT',
    'TSOT': 'TITLESORT',
    'TSP': 'ARTISTSORT',
    'TSRC': 'ISRC',
    'TSST': 'DISCSUBTITLE',
    'TST': 'TITLESORT',
    'TT1': 'CONTENTGROUP',
    'TT2': 'TITLE',
    'TT3': 'SUBTITLE',
    'TXT': 'LYRICIST',
    'TXXX:ORCHESTRA': 'ORCHESTRA',
    'TXX:ORCHESTRA': 'ORCHESTRA',
    'TYE': 'DATE',
    'TYER': 'DATE',
    'ULT': 'LYRICS',
    'USLT': 'LYRICS',
    'YEAR': 'DATE',
    'aART': 'ALBUMARTIST',
    'cond': 'CONDUCTOR',
    'cpil': 'COMPILATION',
    'cprt': 'COPYRIGHT',
    'disk': 'DISCNUMBER',
    'gnre': 'GENRE',
    'labl': 'LABEL',
    'soaa': 'ALBUMARTISTSORT',
    'soal': 'ALBUMSORT',
    'soar': 'ARTISTSORT',
    'soco': 'COMPOSERSORT',
    'sonm': 'TITLESORT',
    'tmpo': 'BPM',
    'trkn': 'TRACKNUMBER',
    '\xa9ART': 'ARTIST',
    '\xa9alb': 'ALBUM',
    '\xa9cmt': 'COMMENT',
    '\xa9con': 'CONDUCTOR',
    '\xa9day': 'DATE',
    '\xa9gen': 'GENRE',
    '\xa9grp': 'CONTENTGROUP',
    '\xa9lyr': 'LYRICS',
    '\xa9nam': 'TITLE',
    '\xa9ope': 'ORIGINALARTIST',
    '\xa9too': 'ENCODEDBY',
    '\xa9wrt': 'COMPOSER',
}
def tobytes(s):
    """Return *s* as bytes.

    - bytes (including subclasses) are returned unchanged;
    - a bytearray is copied into an immutable bytes object;
    - str is encoded as UTF-8, unencodable characters being replaced;
    - anything else is converted with str() first, then encoded.
    """
    # isinstance() rather than an exact type comparison: with the latter a
    # bytes subclass or a bytearray would fall through to str(), and the
    # *repr* of the value ("b'...'") would be encoded instead of its content.
    if isinstance(s, bytes):
        return s
    if isinstance(s, bytearray):
        return bytes(s)
    if not isinstance(s, str):
        s = str(s)
    return s.encode('utf-8', errors='replace')
# mp3: album, title, artist, genre, date, tracknumber
# flac: album, title, artist, genre, xxx, tracknumber
# oggvorbis:album, title, artist, genre, date, tracknumber
class AudioTagExtractor(RclBaseHandler):
    """Recoll handler extracting tags and audio parameters with mutagen.

    html_text() opens the file with mutagen, builds a dictionary of fields
    (str keys, bytes values), hands each of them to the execm object through
    setfield(), and returns mutagen's pretty-printed description of the file
    as the document text.
    """

    def __init__(self, em):
        """Initialise the handler and load the optional tag fixer script.

        em is passed to the base class constructor. If the
        "audiotagfixerscript" configuration parameter is set, the script it
        names is executed with runpy and its 'tagfix' global is stored in
        self.tagfix; html_text() later calls it with the fields dictionary.
        Any failure while setting this up is logged and otherwise ignored.
        """
        super(AudioTagExtractor, self).__init__(em)
        config = rclconfig.RclConfig()
        tagfixerfn = config.getConfParam("audiotagfixerscript")
        self.tagfix = None
        if tagfixerfn:
            import runpy
            try:
                d = runpy.run_path(tagfixerfn)
                self.tagfix = d['tagfix']
                # NOTE(review): called here without arguments, while
                # html_text() calls it with the fields dictionary. If this
                # call raises, self.tagfix nevertheless stays set. Confirm
                # the contract expected from the user script.
                self.tagfix()
            except Exception as ex:
                # Best effort: a broken fixer script must not prevent
                # indexing, but say why it was not (fully) set up.
                self.em.rclog("tagfix script setup failed: %s" % ex)

    def _showMutaInfo(self, mutf):
        """Debugging aid: log every attribute of mutf.info."""
        self.em.rclog("%s" % mutf.info.pprint())
        for prop in dir(mutf.info):
            self.em.rclog("mutinfo: %s -> %s" %
                          (prop, getattr(mutf.info, prop)))

    def _fixrating(self, minf):
        """Replace the 'rating1' field with a 1..5 'rating' field.

        The keys tested are lower-case because html_text() stores the
        lower-cased canonical tag names. 'rating1' is always removed. If no
        'rating' is present yet and 'rating1' holds an integer, 'rating' is
        set to value // 51 + 1, clamped to the 1..5 range (presumably
        mapping a 0..255 scale onto stars -- TODO confirm).
        """
        if 'rating1' not in minf:
            return
        raw = minf.pop('rating1')
        if 'rating' in minf:
            return
        try:
            val = int(raw) // 51 + 1
        except (TypeError, ValueError):
            # Not a plain number: nothing sensible to derive from it.
            return
        # Stored as bytes, like every other value in minf.
        minf['rating'] = tobytes(min(5, max(1, val)))

    def _embeddedImageFormat(self, mutf):
        """Return 'jpg' or 'png' if the file embeds a picture, else ''.

        Only the first picture found is looked at, and only the MIME types
        audio/mp3, audio/x-flac and audio/mp4 are handled. Anything which is
        not JPEG is reported as 'png'.
        """
        #self.em.rclog("_embeddedImage: MIME: %s"%mutf.mime)
        if 'audio/mp3' in mutf.mime:
            for tagname in mutf.keys():
                if tagname.startswith('APIC:'):
                    #self.em.rclog("mp3 img: %s" % mutf[tagname].mime)
                    return 'jpg' if mutf[tagname].mime == 'image/jpeg' \
                        else 'png'
        elif 'audio/x-flac' in mutf.mime:
            if mutf.pictures:
                return 'jpg' if mutf.pictures[0].mime == 'image/jpeg' \
                    else 'png'
        elif 'audio/mp4' in mutf.mime:
            if 'covr' in mutf.keys():
                covers = mutf['covr']
                # Guard against a present but empty cover list.
                if covers:
                    imgfmt = covers[0].imageformat
                    if imgfmt == mutagen.mp4.AtomDataType.JPEG:
                        return 'jpg'
                    return 'png'
        return ''

    # Date formats found in actual files (any date field): [1961] [1967-01-01]
    # [1996-11-04T08:00:00Z] [] [0000] [1994-08-08 07:00]
    # We don't try to process the time part.
    # The method translates the date into a Unix timestamp
    # which means possible trouble for pre-1970 recordings (negative time).
    # Oldest possible date with 32 bits time stamp is 1901, which is ok though.
    #
    # This is not used as we don't try to set dmtime (which would not
    # be used by the current indexer anyway). We instead set a 'date'
    # metadata entry.
    def parsedate(self, dt):
        """Convert a 'YYYY' or 'YYYY-MM-DD[...]' string to a Unix time string.

        Only the first 10 characters of dt are considered. Returns the
        timestamp (local time, as computed by time.mktime) formatted as a
        decimal string, or '' if the value has an unsupported layout, an
        all-zero year, is not a valid date, or cannot be represented as a
        timestamp on this platform.
        """
        dt = dt[:10]
        parts = dt.split('-')
        if len(parts) not in (1, 3) or len(parts[0]) != 4 or \
           parts[0] == '0000':
            return ''
        fmt = "%Y" if len(parts) == 1 else "%Y-%m-%d"
        try:
            pdt = datetime.datetime.strptime(dt, fmt)
            val = time.mktime(pdt.timetuple())
        except (ValueError, OverflowError, OSError):
            # Malformed date (e.g. non-digits, month 13), or a date that
            # mktime() cannot represent.
            return ''
        return "%d" % val

    def _audioparams(self, mutf, filename):
        """Return the audio parameters of mutf as a {str: bytes} dictionary.

        Keys: sample_rate, channels, bitrate, duration, bits_per_sample.
        Not all file types supply all properties or even use the same
        names, so defaults are used for the missing ones, and a missing
        bitrate is estimated from the file size and the duration.
        """
        minf = {}
        for prop, dflt in (('sample_rate', 44100), ('channels', 2),
                           ('length', 0), ('bitrate', 0)):
            try:
                minf[prop] = getattr(mutf.info, prop)
            except Exception:
                #self.em.rclog("NO %s prop: %s" % (prop, e))
                minf[prop] = dflt

        if minf['bitrate'] == 0 and minf['length'] > 0:
            minf['bitrate'] = int(os.path.getsize(filename) * 8 /
                                  minf['length'])

        minf['duration'] = minf.pop('length')

        # Bits/samp is named sample_size or bits_per_sample (depend on file tp)
        try:
            bps = getattr(mutf.info, 'bits_per_sample')
        except Exception:
            try:
                bps = getattr(mutf.info, 'sample_size')
            except Exception:
                #self.em.rclog("using default bits_per_sample")
                bps = 16
        minf['bits_per_sample'] = bps

        return {tag: tobytes(val) for tag, val in minf.items()}

    def _extracttags(self, mutf, minf):
        """Store the tags of mutf known to tagdict into minf.

        The tag names vary depending on the file type: tagdict translates
        them all to canonical names, which are used lower-cased as minf
        keys. Values are stored as bytes.
        """
        for tag, val in mutf.items():
            if tag.startswith('TXXX:'):
                tag = tag[5:].upper()
            elif tag.startswith('TXX:'):
                tag = tag[4:].upper()
            elif tag.upper() in tagdict:
                tag = tag.upper()

            if tag not in tagdict:
                #self.em.rclog("Unprocessed tag: %s, value %s"%(tag,val))
                continue

            # Some file types return lists of values (e.g. FLAC): join them.
            # A plain string is left alone, as joining it would insert a
            # space between each of its characters.
            if not isinstance(val, (str, bytes)):
                try:
                    val = " ".join(val)
                except Exception:
                    # Not an iterable of strings: tobytes() will use str().
                    pass
            ntag = tagdict[tag].lower()
            try:
                minf[ntag] = tobytes(val)
                #self.em.rclog("Tag %s -> %s" % (ntag, val))
            except Exception as err:
                self.em.rclog("Error while extracting tag: %s" % err)

    @staticmethod
    def _hastotal(total):
        """Tell if total (bytes or number) is a usable, non-zero, count."""
        try:
            return int(total) != 0
        except (TypeError, ValueError):
            # Not a number: keep it unless it is empty.
            return bool(total)

    def _splitnumbers(self, minf):
        """Split combined 'number/total' disc and track values in minf.

        TPA,TPOS,disk DISCNUMBER/TOTALDISCS
        TRCK,TRK,trkn TRACKNUMBER/TOTALTRACKS
        A value matching re_pairnum (e.g. b'[(3, 12)]'), or made of exactly
        two '/'-separated parts (e.g. b'3/12'), is replaced by its first
        part; the second part, unless zero or empty, is stored as
        'totaldiscs' / 'totaltracks'. Other values are left untouched.
        """
        for what in ('disc', 'track'):
            k = what + 'number'
            if k not in minf:
                continue
            pair = minf[k]
            if isinstance(pair, tuple):
                self.em.rclog("l is tuple: %s types %s" %
                              (pair, [type(e) for e in pair]))
            else:
                mo = re_pairnum.match(pair)
                if mo:
                    pair = (mo.group(1), mo.group(2))
                else:
                    pair = pair.split(b'/')
            if len(pair) == 2:
                minf[k] = tobytes(pair[0])
                #self.em.rclog("minf[%s] = %s" % (k, minf[k]))
                # The parts are normally bytes, so a plain comparison with
                # the integer 0 would always succeed: test the numeric value.
                if self._hastotal(pair[1]):
                    minf['total' + what + 's'] = tobytes(pair[1])
            #self.em.rclog("%s finally: %s" %(k,minf[k]))

    def html_text(self, filename):
        """Extract the fields of an audio file and return its text.

        Sets self.outputmimetype to 'text/plain', sets the 'charset' field
        and one field per audio parameter or known tag (plus 'author' as a
        copy of 'artist', and 'embdimg' when a picture is embedded) through
        self.em.setfield(). If a tag fixer was loaded, it is called with the
        fields dictionary before the fields are set.

        Returns the output of mutf.pprint() as bytes, or b'' if it fails or
        if mutagen returned nothing usable without reporting an error.
        Raises Exception if the input MIME type is not set or if opening
        the file failed with an error message.
        """
        if not self.inputmimetype:
            raise Exception("html_text: input MIME type not set")

        # We actually output text/plain
        self.outputmimetype = 'text/plain'

        mutf = None
        strex = ''
        try:
            mutf = File(filename)
        except Exception as ex:
            strex = str(ex)

        if not mutf:
            # Note: mutagen will fail the open (and raise) for a valid
            # file with no tags. Maybe we should just return an empty
            # text in this case? We seem to get an empty str(ex) in
            # this case, and a non empty one for, e.g. permission
            # denied, but I am not sure that the emptiness will be
            # consistent for all file types. The point of detecting
            # this would be to avoid error messages and useless
            # retries.
            if not strex:
                return b''
            raise Exception("Open failed: %s" % strex)

        #self._showMutaInfo(mutf)

        # minf has natural str keys, and encoded values
        minf = self._audioparams(mutf, filename)
        self._extracttags(mutf, minf)
        self._fixrating(minf)
        #self.em.rclog("minf after extract %s\n" % minf)
        self._splitnumbers(minf)

        if 'orchestra' in minf:
            val = minf['orchestra']
            if val.startswith(b'orchestra='):
                minf['orchestra'] = val[10:]

        #self.em.rclog("minf after tags %s\n" % minf)

        # Check for embedded image. We just set a flag.
        embdimg = self._embeddedImageFormat(mutf)
        if embdimg:
            #self.em.rclog("Embedded image format: %s" % embdimg)
            minf['embdimg'] = tobytes(embdimg)

        self.em.setfield("charset", 'utf-8')
        if self.tagfix:
            self.tagfix(minf)

        for tag, val in minf.items():
            #self.em.rclog("%s -> %s" % (tag, val))
            self.em.setfield(tag, val)
            # Compat with old version
            if tag == 'artist':
                self.em.setfield('author', val)

        try:
            docdata = tobytes(mutf.pprint())
        except Exception as err:
            # Keep the return type consistent: the text is always bytes.
            docdata = b""
            self.em.rclog("Doc pprint error: %s" % err)

        return docdata
def makeObject():
    """Build an RclExecM and an AudioTagExtractor, printing progress.

    The created objects are discarded and the constant 17 is returned.
    NOTE(review): this looks like a debugging/embedding test helper; nothing
    in this file calls it -- confirm it is still needed.
    """
    print("makeObject")
    execm = rclexecm.RclExecM()
    print("makeObject: rclexecm ok")
    # Constructed only to check that the handler can be created.
    AudioTagExtractor(execm)
    return 17
if __name__ == '__main__':
    # Script entry point: create the execm protocol object, attach an
    # AudioTagExtractor to it, and hand both to the rclexecm main loop.
    proto = rclexecm.RclExecM()
    rclexecm.main(proto, AudioTagExtractor(proto))