395 lines
12 KiB
Python
Executable File
395 lines
12 KiB
Python
Executable File
#!/usr/bin/env python2
|
|
|
|
# Audio tag filter for Recoll, using mutagen
|
|
|
|
import sys
|
|
import os
|
|
import rclexecm
|
|
import time
|
|
import datetime
|
|
import re
|
|
|
|
# mutagen is the only external helper this filter needs. If it cannot be
# loaded, report it on stdout and stop with a non-zero status.
# NOTE(review): the message format is presumably the one the Recoll indexer
# expects for a missing helper -- confirm against rclexecm.
try:
    import mutagen
    from mutagen import File
    from mutagen.id3 import ID3TimeStamp
except Exception:
    # 'Exception' rather than a bare except, so that KeyboardInterrupt and
    # SystemExit are not mistaken for a missing module; still broad enough to
    # cover an installed mutagen that fails to load for another reason.
    print("RECFILTERROR HELPERNOTFOUND python:mutagen")
    sys.exit(1)
|
|
|
|
|
|
# Matches the printed form of a "(number, total)" pair, e.g. "(3, 12)".
# Raw bytes literal (the 'br' spelling is accepted by Python 2.7 and 3) so
# that the regex escapes are not interpreted as string escapes.
re_pairnum = re.compile(br'''\(([0-9]+),\s*([0-9]+)\)''')
|
|
|
|
# The 'Easy' mutagen tags conversions are incomplete. We do it ourselves.
|
|
# TPA,TPOS,disc DISCNUMBER/TOTALDISCS
|
|
# TRCK,TRK,trkn TRACKNUMBER/TOTALTRACKS
|
|
# Translation table: tag name as found in the file -> canonical field name.
# Lookups try the upper-cased tag first and then the tag exactly as found,
# which is why the lower-case / '\xa9' keys at the end are kept verbatim.
tagdict = {
    'ALBUM ARTIST': 'ALBUMARTIST',
    'ALBUM': 'ALBUM',
    'ALBUMARTIST': 'ALBUMARTIST',
    'ALBUMARTISTSORT': 'ALBUMARTISTSORT',
    'ALBUMSORT': 'ALBUMSORT',
    'ARTIST': 'ARTIST',
    'ARTISTSORT': 'ARTISTSORT',
    'BPM': 'BPM',
    'COM': 'COMMENT',
    'COMM': 'COMMENT',
    'COMMENT': 'COMMENT',
    'COMPILATION': 'COMPILATION',
    'COMPOSER': 'COMPOSER',
    'COMPOSERSORT': 'COMPOSERSORT',
    'CONDUCTOR': 'CONDUCTOR',
    'CONTENTGROUP': 'CONTENTGROUP',
    'COPYRIGHT': 'COPYRIGHT',
    'DATE': 'DATE',
    'DISCNUMBER': 'DISCNUMBER',
    'DISCSUBTITLE': 'DISCSUBTITLE',
    'DISCTOTAL': 'TOTALDISCS',
    'ENCODEDBY': 'ENCODEDBY',
    'ENSEMBLE': 'ORCHESTRA',
    'GENRE': 'GENRE',
    'ISRC': 'ISRC',
    'LABEL': 'LABEL',
    'LANGUAGE': 'LANGUAGE',
    'LYRICIST': 'LYRICIST',
    'LYRICS': 'LYRICS',
    'MOOD': 'MOOD',
    'ORCHESTRA': 'ORCHESTRA',
    'PERFORMER': 'PERFORMER',
    'ORIGINALARTIST': 'ORIGINALARTIST',
    'ORIGINALDATE': 'ORIGINALDATE',
    'RELEASEDATE': 'RELEASEDATE',
    'REMIXER': 'REMIXER',
    'SUBTITLE': 'SUBTITLE',
    'TAL': 'ALBUM',
    'TALB': 'ALBUM',
    'TBP': 'BPM',
    'TBPM': 'BPM',
    'TCM': 'COMPOSER',
    'TCMP': 'COMPILATION',
    'TCO': 'GENRE',
    'TCOM': 'COMPOSER',
    'TCON': 'GENRE',
    'TCOP': 'COPYRIGHT',
    'TCP': 'COMPILATION',
    'TCR': 'COPYRIGHT',
    'TDA': 'DATE',
    'TDAT': 'DATE',
    'TDOR': 'ORIGINALDATE',
    'TDRC': 'DATE',
    'TDRL': 'RELEASEDATE',
    'TEN': 'ENCODEDBY',
    'TENC': 'ENCODEDBY',
    'TEXT': 'LYRICIST',
    'TIT1': 'CONTENTGROUP',
    'TIT2': 'TITLE',
    'TIT3': 'SUBTITLE',
    'TITLE': 'TITLE',
    'TITLESORT': 'TITLESORT',
    'TLA': 'LANGUAGE',
    'TLAN': 'LANGUAGE',
    'TMOO': 'MOOD',
    'TOA': 'ORIGINALARTIST',
    'TOPE': 'ORIGINALARTIST',
    'TOR': 'ORIGINALDATE',
    'TORY': 'ORIGINALDATE',
    'TOTALDISCS': 'TOTALDISCS',
    'TOTALTRACKS': 'TOTALTRACKS',
    'TP1': 'ARTIST',
    'TP2': 'ALBUMARTIST',
    'TP3': 'CONDUCTOR',
    'TP4': 'REMIXER',
    'TPA': 'DISCNUMBER',
    'TPB': 'LABEL',
    'TPE1': 'ARTIST',
    'TPE2': 'ALBUMARTIST',
    'TPE3': 'CONDUCTOR',
    'TPE4': 'REMIXER',
    'TPOS': 'DISCNUMBER',
    'TPUB': 'LABEL',
    'TRACK': 'TRACKNUMBER',
    'TRACKNUM': 'TRACKNUMBER',
    'TRACKNUMBER': 'TRACKNUMBER',
    'TRACKTOTAL': 'TOTALTRACKS',
    'TRC': 'ISRC',
    'TRCK': 'TRACKNUMBER',
    'TRK': 'TRACKNUMBER',
    'TS2': 'ALBUMARTISTSORT',
    'TSA': 'ALBUMSORT',
    'TSC': 'COMPOSERSORT',
    'TSO2': 'ALBUMARTISTSORT',
    'TSOA': 'ALBUMSORT',
    'TSOC': 'COMPOSERSORT',
    'TSOP': 'ARTISTSORT',
    'TSOT': 'TITLESORT',
    'TSP': 'ARTISTSORT',
    'TSRC': 'ISRC',
    'TSST': 'DISCSUBTITLE',
    'TST': 'TITLESORT',
    'TT1': 'CONTENTGROUP',
    'TT2': 'TITLE',
    'TT3': 'SUBTITLE',
    'TXT': 'LYRICIST',
    'TXXX:ORCHESTRA': 'ORCHESTRA',
    'TYE': 'DATE',
    'TYER': 'DATE',
    'ULT': 'LYRICS',
    'USLT': 'LYRICS',
    'YEAR': 'DATE',
    # Keys below are not upper-case and are only matched exactly as found.
    'aART': 'ALBUMARTIST',
    'cond': 'CONDUCTOR',
    'cpil': 'COMPILATION',
    'cprt': 'COPYRIGHT',
    'disk': 'DISCNUMBER',
    'gnre': 'GENRE',
    'labl': 'LABEL',
    'soaa': 'ALBUMARTISTSORT',
    'soal': 'ALBUMSORT',
    'soar': 'ARTISTSORT',
    'soco': 'COMPOSERSORT',
    'sonm': 'TITLESORT',
    'tmpo': 'BPM',
    'trkn': 'TRACKNUMBER',
    '\xa9ART': 'ARTIST',
    '\xa9alb': 'ALBUM',
    '\xa9cmt': 'COMMENT',
    '\xa9con': 'CONDUCTOR',
    '\xa9day': 'DATE',
    '\xa9gen': 'GENRE',
    '\xa9grp': 'CONTENTGROUP',
    '\xa9lyr': 'LYRICS',
    '\xa9nam': 'TITLE',
    '\xa9ope': 'ORIGINALARTIST',
    '\xa9too': 'ENCODEDBY',
    '\xa9wrt': 'COMPOSER',
}
|
|
|
|
# mp3: album, title, artist, genre, date, tracknumber
|
|
# flac: album, title, artist, genre, xxx, tracknumber
|
|
# oggvorbis:album, title, artist, genre, date, tracknumber
|
|
class AudioTagExtractor:
    """Turn the tags and audio properties of one audio file into fields.

    The object is driven by rclexecm through openfile() / getipath() /
    getnext(). Each file yields a single document: the fields are set on
    the 'em' object handed to the constructor and the document text is
    mutagen's pretty-printed dump of the file.
    """

    # Audio properties read from the mutagen 'info' object, each with the
    # value used when the file type does not supply the property.
    _INFO_DEFAULTS = (('sample_rate', 44100), ('channels', 2),
                      ('length', 0), ('bitrate', 0))

    def __init__(self, em):
        """Remember the rclexecm object used for logging and field output."""
        self.em = em
        self.currentindex = 0

    def _showMutaInfo(self, mutf):
        """Debugging aid: log every attribute of the mutagen info object."""
        self.em.rclog("%s" % mutf.info.pprint())
        for prop in dir(mutf.info):
            self.em.rclog("mutinfo: %s -> %s" %
                          (prop, getattr(mutf.info, prop)))

    def _printableFilename(self):
        """Return the current file name decoded as UTF-8 (lossy)."""
        return self.filename.decode('utf-8', errors='replace')

    def _embeddedImageFormat(self, mutf):
        """Return 'jpg' or 'png' if the file embeds a picture, else ''.

        Only mp3 (APIC frames), flac (pictures list) and mp4 ('covr' atom)
        are examined. Anything which is not flagged as JPEG is reported as
        'png'.
        """
        #self.em.rclog("_embeddedImage: MIME: %s"%mutf.mime)
        if 'audio/mp3' in mutf.mime:
            for tagname in mutf.keys():
                if tagname.startswith('APIC:'):
                    #self.em.rclog("mp3 img: %s" % mutf[tagname].mime)
                    if mutf[tagname].mime == 'image/jpeg':
                        return 'jpg'
                    return 'png'
        elif 'audio/x-flac' in mutf.mime:
            if mutf.pictures:
                if mutf.pictures[0].mime == 'image/jpeg':
                    return 'jpg'
                return 'png'
        elif 'audio/mp4' in mutf.mime:
            if 'covr' in mutf.keys():
                covers = mutf['covr']
                # An empty cover list means there is no image to report.
                if covers:
                    imgfmt = covers[0].imageformat
                    if imgfmt == mutagen.mp4.AtomDataType.JPEG:
                        return 'jpg'
                    return 'png'
        return ''

    # Date formats found in actual files (any date field): [1961] [1967-01-01]
    # [1996-11-04T08:00:00Z] [] [0000] [1994-08-08 07:00]
    # We don't try to process the time part.
    def parsedate(self, dt):
        """Translate a tag date into a Unix timestamp string.

        Accepts 'YYYY' or 'YYYY-MM-DD', optionally followed by a time part
        which is ignored. Returns the timestamp formatted with '%d', or ''
        when the value is empty, is year 0000, has an unsupported shape or
        cannot be converted (malformed digits, date out of the platform's
        mktime() range, which may happen for pre-1970 recordings).

        This is not used as we don't try to set dmtime (which would not
        be used by the current indexer anyway). We instead set a 'date'
        metadata entry.
        """
        if len(dt) > 10:
            dt = dt[0:10]
        parts = dt.split('-')
        if len(parts) > 3 or len(parts) == 2 or len(parts[0]) != 4 \
           or parts[0] == '0000':
            return ''
        fmt = "%Y" if len(parts) == 1 else "%Y-%m-%d"
        try:
            pdt = datetime.datetime.strptime(dt, fmt)
            val = time.mktime(pdt.timetuple())
        except (ValueError, OverflowError):
            # Not a real date, or not representable as a local timestamp.
            return ''
        return "%d" % val

    def _hasTotal(self, value):
        """Tell if the 'total' half of a number/total pair is worth storing.

        Zero (as a number or as text such as '0') and empty values mean
        "unknown". Non-numeric text is kept.
        """
        try:
            return int(value) != 0
        except (TypeError, ValueError):
            return bool(value)

    def _audioParams(self, mutf, filename):
        """Return the audio properties of the file as a dict of strings.

        Not all file types supply all properties, or even use the same
        property names, hence the defaults and the alternate names.
        """
        minf = {}
        for prop, dflt in self._INFO_DEFAULTS:
            try:
                minf[prop] = getattr(mutf.info, prop)
            except Exception:
                #self.em.rclog("NO %s prop: %s" % (prop, e))
                minf[prop] = dflt

        # No bitrate supplied: estimate it from the file size and duration.
        if minf['bitrate'] == 0 and minf['length'] > 0:
            try:
                minf['bitrate'] = int(os.path.getsize(filename) * 8 /
                                      minf['length'])
            except OSError as err:
                self.em.rclog("extractone: can't get file size: %s" % err)

        minf['duration'] = minf.pop('length')

        # Bits/samp is named sample_size or bits_per_sample (depend on file tp)
        for name in ('bits_per_sample', 'sample_size'):
            try:
                minf['bits_per_sample'] = getattr(mutf.info, name)
                break
            except Exception:
                continue
        else:
            #self.em.rclog("using default bits_per_sample")
            minf['bits_per_sample'] = 16

        return dict((tag, str(val)) for tag, val in minf.items())

    def _collectTags(self, mutf, minf):
        """Copy the tags known to tagdict into minf under canonical names.

        Only the first value of a multi-valued tag is kept, and empty
        values are skipped. A tag whose value can't be converted is logged
        and ignored.
        """
        for tag, val in mutf.items():
            #self.em.rclog("Original tag: <%s>, val <%s>" % (tag, val))
            if tag.upper() in tagdict:
                tag = tag.upper()
            if tag not in tagdict:
                #self.em.rclog("Unprocessed tag: %s, value %s"%(tag,val))
                continue
            ntag = tagdict[tag].lower()
            #self.em.rclog("New tag: %s" % ntag)
            try:
                if isinstance(val, bool):
                    val0 = str(val)
                else:
                    try:
                        val0 = val[0]
                    except Exception:
                        # Not indexable: use the value as a whole.
                        val0 = val
                if val0:
                    if isinstance(val0, type(u"")):
                        val0 = val0.encode('utf-8', errors='replace')
                    else:
                        val0 = str(val0)
                    minf[ntag] = val0
                    #self.em.rclog("Tag %s -> %s" % (ntag, val0))
            except Exception as err:
                self.em.rclog("Error while extracting tag: %s" % err)

    def _splitNumberPairs(self, minf):
        """Split 'n/total' or '(n, total)' disc and track numbers in minf.

        TPA,TPOS,disc  DISCNUMBER/TOTALDISCS
        TRCK,TRK,trkn  TRACKNUMBER/TOTALTRACKS
        """
        for what in ('disc', 'track'):
            k = what + 'number'
            if k not in minf:
                continue
            pair = minf[k]
            if not isinstance(pair, tuple):
                mo = re_pairnum.match(pair)
                if mo:
                    pair = (mo.group(1), mo.group(2))
                else:
                    pair = pair.split(b'/')
            else:
                self.em.rclog("l is tuple: %s" % (pair,))
            if len(pair) == 2:
                minf[k] = str(pair[0])
                #self.em.rclog("minf[%s] = %s" % (k, minf[k]))
                # The total is text at this point: compare it numerically
                # so that an unknown total of 0 is not stored.
                if self._hasTotal(pair[1]):
                    minf['total' + what + 's'] = str(pair[1])

    def extractone(self, params):
        """Extract the fields and text for the file named in params.

        params must hold the "filename:" and "mimetype:" entries. Returns
        a tuple (ok, docdata, "", eof) where eof is RclExecM.eofnow on
        failure and RclExecM.eofnext on success. Failures (missing
        parameters, file which mutagen can't open or does not recognize)
        are logged and reported with ok False and empty data.
        """
        #self.em.rclog("extractone %s %s" % (params["filename:"],
        #                                    params["mimetype:"]))
        docdata = ""
        if "mimetype:" not in params or "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, docdata, "", rclexecm.RclExecM.eofnow)
        filename = params["filename:"]
        self.filename = filename
        try:
            mutf = File(filename)
        except Exception as err:
            self.em.rclog("extractone: extract failed: [%s]" % err)
            return (False, docdata, "", rclexecm.RclExecM.eofnow)
        if mutf is None:
            # mutagen returns None, without raising, when it can't
            # determine the file type.
            self.em.rclog("extractone: extract failed: "
                          "[unrecognized file type]")
            return (False, docdata, "", rclexecm.RclExecM.eofnow)

        #self._showMutaInfo(mutf)

        minf = self._audioParams(mutf, filename)
        #self.em.rclog("minf after audio %s\n" % minf)

        # Metadata tags. The names vary depending on the file type. We
        # just have a big translation dictionary for all
        self._collectTags(mutf, minf)
        self._splitNumberPairs(minf)

        if 'orchestra' in minf:
            val = minf['orchestra']
            if val.startswith('orchestra='):
                minf['orchestra'] = val[10:]

        #self.em.rclog("minf after tags %s\n" % minf)

        # Check for embedded image. We just set a flag.
        embdimg = self._embeddedImageFormat(mutf)
        if embdimg:
            #self.em.rclog("Embedded image format: %s" % embdimg)
            minf["embdimg"] = embdimg

        self.em.setmimetype("text/plain")
        self.em.setfield("charset", 'utf-8')

        for tag, val in minf.items():
            #self.em.rclog("%s -> %s" % (tag, val))
            self.em.setfield(tag, val)
            # Compat with old version
            if tag == 'artist':
                self.em.setfield('author', val)

        try:
            docdata = mutf.pprint().encode('utf-8', errors='replace')
        except Exception as err:
            self.em.rclog("Doc pprint error: %s" % err)

        return (True, docdata, "", rclexecm.RclExecM.eofnext)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Start processing a new file: reset the document counter."""
        self.currentindex = 0
        return True

    def getipath(self, params):
        """Return the document for the file (there is only one per file)."""
        return self.extractone(params)

    def getnext(self, params):
        """Return the file's single document, then signal end of file."""
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        ret = self.extractone(params)
        self.currentindex += 1
        return ret
|
|
|
|
|
|
def makeObject():
    """Build an RclExecM and an AudioTagExtractor, printing progress.

    The objects are discarded and the constant 17 is returned.
    NOTE(review): looks like a leftover debugging entry point; the meaning
    of 17 is not visible in this file.
    """
    print("makeObject")
    executor = rclexecm.RclExecM()
    print("makeObject: rclexecm ok")
    AudioTagExtractor(executor)
    return 17
|
|
|
|
|
|
if __name__ == '__main__':
    # Run as a script: create the rclexecm object and the extractor using
    # it, then hand both to rclexecm.main().
    executor = rclexecm.RclExecM()
    handler = AudioTagExtractor(executor)
    rclexecm.main(executor, handler)
|