From 879225d6879c9d8c0c382db93459d08b3be7f28d Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Mon, 31 Jan 2011 09:32:26 +0100 Subject: [PATCH] Added language-based helper for classifying iso-8859-x encodings --- .hgignore | 1 + src/filters/eulangclass.py | 81 ++++++++++++++ src/filters/iso8859stops.zip | Bin 0 -> 9694 bytes src/filters/rclkar | 200 ++++++++++++++++++++++++++--------- 4 files changed, 233 insertions(+), 49 deletions(-) create mode 100755 src/filters/eulangclass.py create mode 100644 src/filters/iso8859stops.zip diff --git a/.hgignore b/.hgignore index 39a601d9..4c5d3123 100644 --- a/.hgignore +++ b/.hgignore @@ -52,6 +52,7 @@ src/doc/user/usermanual.html src/doc/user/usermanual.html-text src/doc/user/usermanual.txt src/filters/rclexecm.pyc +src/filters/eulangclass.pyc src/index/alldeps src/index/recollindex src/lib/alldeps diff --git a/src/filters/eulangclass.py b/src/filters/eulangclass.py new file mode 100755 index 00000000..242f02d1 --- /dev/null +++ b/src/filters/eulangclass.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python + +import sys +import string +import glob +import os +import os.path +from zipfile import ZipFile + +class European8859TextClassifier: + def __init__(self, langzip): + self.langtables = self.readlanguages(langzip) + + # Table to translate from punctuation to spaces + punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r" + spaces = "" + for c in punct: + spaces += " " + self.spacetable = string.maketrans(punct, spaces) + + # Read the languages stopwords lists + def readlanguages(self, langzip): + zip = ZipFile(langzip) + langfiles = zip.namelist() + langs = [] + for fn in langfiles: + text = zip.read(fn) + words = text.split() + langcode = os.path.basename(fn) + langcode = os.path.splitext(langcode)[0] + (lang,code) = langcode.split('_') + langs.append((lang, code, words)) + return langs + + def classify(self, rawtext): + + # Remove punctuation + rawtext = rawtext.translate(self.spacetable) + # Split words + words = rawtext.split() + # Count frequencies + dict = {} + for w in words: + dict[w] = dict.get(w, 0) + 1 + # Order word list by frequency + lfreq = sorted(dict.iteritems(), \ + key=lambda entry: entry[1], reverse=True) + # Check the ntest most frequent words against the language lists and + # chose the best match + ntest = 10 + maxcount = 0 + maxlang = "" + maxcode = "" + for lang,code,lwords in self.langtables: + count = 0 + for w,c in lfreq[0:ntest]: + if w in lwords: + count += 1 + print "Lang %s code %s count %d" % (lang, code, count) + if maxcount < count: + maxlang = lang + maxcount = count + maxcode = code + # If match too bad, default to most common + if maxcount == 0: + maxlang,maxcode = ('english', 'cp1252') + return (maxlang, maxcode, maxcount) + + +if __name__ == "__main__": + f = open(sys.argv[1]) + rawtext = f.read() + f.close() + + dir = os.path.dirname(__file__) + langszip = os.path.join(dir, 'iso8859stops.zip') + + classifier = European8859TextClassifier(langszip) + + lang,code,count = classifier.classify(rawtext) + print "Chosen lang/code/matchcount: %s %s %d" % (lang, code, count) diff --git a/src/filters/iso8859stops.zip b/src/filters/iso8859stops.zip new file mode 100644 index 0000000000000000000000000000000000000000..0b0c64401df921742660194c8724108bb40db625 GIT binary patch literal 9694 zcmaKyWl&w~vb7h%-QC^Y-QC@TySqCCcb7nLcY+h#B{+fL?vg+Vns0H>t-JH>a8lF+ z)}NgUx z&cfj8>8h><4FHWA6_?*4kdSxtgav?r9fAS?|MN>-OE+$p3h8mZy|`F{B`j7}+<5f% z2K(M;2P7A}oR&n2s5hb0d(>*f#gD#?E;y)p_Xj0s3-S7>p8EAj2;x>7m{v(UoFGz# zPW<2q$?zvi6xKMXV~K}R8i$^)wOTV>9Kd_PhVI>y_`XK_!%lgx$xZd$huwVG*_y15`e0eMcIzfK>E2JjEGLF7$TiNcNw7 z=Y>=(>?n%Na`CV>@{U5+WKjoz9k3mn`ps*Y){d!fTqv8XUs^A$*~#=hh)vg4X#+n6 z6K4^P*b?-}*qqRs${lU#Q?lwsH_DadLHFrh-tpZ9jgfpj^GeK&2plD|Ok(C$*In_3 z?i^q!(2q3$nG;`F&$0JNN?uS$ck!}t`I`P5sWWsRVBc25BzHS*0zOiU4)PHTlG~V$ zISXY0EyOtEvr+j+-6lTE#a+#8iy-Z`J#AhTP+gc4pROB|Jn2Uw7nbn3DKn_ZaC8La zn7q_)i1_4Tj8ZqkbQulD<^d{Xq!?z~mCVKbvq)9|I-!ZU4GKyJw zA;%~bK(+aa!ezNzc=8bv6Eij0GF#R%hS&})r5$%VM$pyQ$4_DrI_sC6dU|vmFF*rM zg!2L3KIkD|u#qrps3JwEr{0WG8!=N^67LpHdmRCM>XYF+FkH@Te(lbJB&Z8ioESsD zIocGp*{O9+#PRqFszy57!;)N#n2i?>bDLL_&{z>-;vADxIzq+F0ux87J$48Y_9Q`= z0?BMy?4GNkvR0I{UN1M^@NCE_{9MQtDC0Qsj7tNLnw5kl{sj`&0#0tM^j42fwiyuy zeXO{s*K1miLH2!=zNe}8?sdW(owRw+=4A~jp$H{rN{4ik4NP3E`lRZ-RZ?M1!^uf z40ntI^;hl<#%>C~8+ZeFS?erZrS*%4N=D`7o?Z=SgaKb}jjDsbFtIY6Zj-r)z@!-t zdOtBzWBsy%RbCmwo}4Mrsm!!7)v3CbhJel#LU!jim9T)>nw~s~hFe0^;d-AH2KfCj z>GaRA4|DFdrw(FMn~U*Faj8LT z6yIZn%PAuXj1Ju|9y)f+JyuVxOeP|>0+o&6W$i)QGqL+$Fu~ijH5(E-{ zu(*jMNk6~yyMkzIwNv3TduNA;@8@+M4+wU^3HTSYp7M6uF%R&&T*wjB1GC|3dqLM7 z@fpxACJBsjipZPKW`gQjG^V6K#P!d0!ibimf_CVv6*N`K#11}x8XP_4tMqIY`)m^Dj5&^@1(BH?6^Amh=-R1iRS z4U6!7O16W+&GReGG{821@jIdSw&!BuqO^93vH3{ z23CGR*r(on{#wt32Js=7eJ(A}4DvkM&=9>|HH2GA)cHdRo(3i&Nk|QSLwM(4uLV2D zYMLkdE-mCm`on_h*@n30_q3bXA|kWgMqg^G8K{Qt1MNov zy9(iUHg6h&y&55i(KUbU7q$+EvIJX+R%)OJm(S%ZJrMF<5~a-RB9}%Uy}4LfCWxL{ zwp5#o->l6`DuxFKBkD(l@^${O3vtyn3`_*HX!%X%O(%|#m5-mi``NR4pg{^v zqBLblkFR6~f`}iM_heC+-?mdDW8|Oyq+5W++z0VySydE2I?7H2gIO1?UqcOw-pFE; zrz8Xpz`-Fb$;>yj?o}C8=+N)r1b0_%1`6u7;bfO{bjz@yMBhSETMbz`n+A{);dC zCr1D`PL%SKVkQAS9M6uoS-4t9+96t`*roFYh+l1}Yn&uKc<@%-}hws5nZYh*97f!9;BbNR2dU4fm zw+`&UGcR_k;4Q1gZjpx{E)f@|wwQO&P8zyLyHi6Okl3&giKL@8mF6^fm}rv2#=!-p ze2+4^G7kH+$!c+RWllA%b@kLg7gwi|5w`R~ChD6Ofti-hOds+;Eczv8 z$ha-eQi&Wir#;VQ*X%=(CY}h>S=X?>)Fro`C537{=FFbhd2al6uohxponh!v zGKOu$v2AO<28|`P%qX>_4^D^R62oR$C_=;gHmpJ^ZlJ*^bsa;>hZ(WdgNo=8)Lu+wLluyoB?CGak6oaX3CS&qA!#(o34vur>O`M!N zw~%Ent}Z_e3}m7e@Eg2E7m1CHSXXM20?F2qPK4j3B}c+*UpnvQtE`4={<6_!e=g+w z9`JlK$CJ^aCGTVi)68Rojxv#Ce&k|j8+uteT4vWkhCw^>ZnC<{sVk-WrlP5unW)t` zE{P;Rb@D`GipC>t0QODq7kr3j1gABnbF0M4L<#0SI|s>%snt({79u0*G8UF*N#+_Q zleRU!$6}^$L|Y+OvL~gWxD}w{A{~;;5V)81SB~ta#g5w4zT?NSo;=^pJl3u}tD-xu zUw%y+G^1F#tNQu%AYx+m`D?T%*g%UmcIaW{*VfB(P>0?|)#7b0!hv6IZQD-tG|rJh ztE`3#PL-6*Ag(pJ8Mf($TG1;S5@l<7qztNpM+uv?V%&#C?6P0KiQZe>q(*6|H&NWk z%Ty^mr;bfNyI}h+`!5uEN zLKh}(TeTNv9V~8Flk17nH`BIekQlQFpOvYZ9cB9z*v<*q;zJj*DrbC~tI=$t9VREQ z95+Wqbmg)sl}Sz{-!{S(p^5a=!hm$6eFdHJ{vmaZgnsl)f~j7LFLV zrg`&-J$D<=kQ~B$FCa+;bj8_EjHr?8ip=Y13>m}27%(&-@uKDKJ(O!kRL z*&5NEq5S;XMya8L;%!;ov^OKqXgJL!;y*3Kr$Rs=n;+4P8w3Clc>@5T0rv%a2WJm6 z3o9f0e{T%O!>=|G{6uB@ftM!2TY9(y> z9W4xp7i{$V{3=kf(o=%q(ve>LYQZPsZmlg@9#_OV56zBZ5km0os zZ-d-{*3puuNB?N>5fb+pyHWB$FLGbzthLQZ?c-6ZEr#=kLBWD)+aphNkB}$A$Fhz)nTtqem$3Z7z%nD=@iWJ1 zWAu00Wd;FNU|HVRzsW!jl%agtRMZM6!&jgTFKe5XtC8(1B1iy57`6CgZIf4#P3vcZ z(nT!IsGJEc={yKL*(fL_99^5B9pU;yv|Dj#)m2&+jiIG1Vs9BE*%}H&?uIDU`Z={2Yd`S(pr2dIEdJOyQ6!wP=jnPVM>U7xBXl{a}034Tmyp zv=A=Bw+$_Z8s>V`En{|dm79<6%?>3tGEiF4vUzdDjgz?umbrz=C?Ya5-e&}FU}R5vvYaiJ5sm2U7EapJL>?l6P(XxQ}{-{O9b914Ya6 z()?5a6ipdu!RSEI937lp-7MV9T+IG`c>G6RJv{s>!bz)4P<)KFUuun_Gl|Wh5v6M8 zpWSIn-`m*x$dGoZaIDdf6A3-q>Q~NiUeSDFxmv?(U89wpwUn08!wj+35A66jrSOQ? za3${4pvRW(c=8ORQ#5Ows=)wPwn89LsX_Oxk{G1tTt}rTysFe06Hf}fH=3@h8-BH& zQao`${zA{$ZzPmw#z1XeoHwLsE+TSR?=YYOBi&izRJg^p{Yn!D1%zf!zHU*5Qxz-g831NAr#`bfDggs-G1v>8 zWaz4{uRV!;ye*XoNETA1ovIPtVqz|P3d^rBR|g~+p`A6^!px~h_Ipq#O!ApRr^gn{z zzKmN|E)MiuTlexNn+x#Gvfa{mNU~Ch5$1F@Ks# zIP-(%TJ>nuPl7P|NTK^<&T;gc*sY9Vob!XK zw)5THy%TgG@L(1XF7~ZW%(-XA8~YwkXrGWAuPwctaSoD|UkoQE+&N-DO_?k%$TOk33j%htNNy*-p-qQ_Z_P~Lb2dzHB&ti zH%2a8OSrM;d(W%=QlIZbqEV8$D*MilT^m)4dC}!b(7x*FB=8)+KkT$+*pafi*@TFz zS2#0CXYV_I$i#28pbzn5NkL*~UZ)hb`d1Blm!(ae-G}cUYj}sQAF@G#f5wA8;T^&~ zE%$H?&-_e3p4^{?w6D)ss7D0&J^j{WA%N-;j}@I=1J!c{``>BV(de~k4DNNKQ&&|^ zIwgX7JV5hdPikeg6`zbKv5xE1^fai=&M050_#%%40xprLoH7o=F4W0DFI}sfTH{r} zLDJRx;fIF!?Qpk9ky*l&U|u&vZWfu6pR6_Ut_)jvyXuKPXc?seua0S0c@arC$IsY0 zNxRxaDTCT8T=$Wfrk)$y90KUJZK%a}wsDEgXj-EEt)>I9)g&AJbtswH zNV@dj=Fv;@?#qZF9J6gPi7nRDK2=Agx)H`fO|tAbj3T>L56+or!H#YoZzl=_2gDpc zF7_EEqkeXy5jfBBEkph6daN5pF(K-(hq8i2W^(vG+xu7`61-eRP;J5 z+M>VrYN+yD4b-S7%DDa*>R!_@5&o9e7P{6aaDTwMm0{%FVGzg+z_0_#uEj$SBa@A+ ziz<4f_SS9G#INDK(vOh?6f75_wpp8ebcWP;8Vl=e`{bX;EkzAF)?EtUqDwKR;6P-T zIKsSX?Vq)nZmCx!X&QvP4iR~3*hOhe8gz!}xb+%fiMWEI8H6w*M0QOSnP6+aFHxGb z2b*w+L={|E?(=aekL`srS#c$DS4G!jho6JR^=ZJ~Gvbbif|jwtPcN!{V895$_h`kU zmhW}g+CMCjZHiap5}R~QQp}wLKjP>?cGa9$eg^&L^#5!Y5P+gjJ8B2k4=k|Vzl=&I z-exAR9)8pHSGzsCifmjJ6Wr<&{n|hlbOM-g1e*QOyN){0^5xIEP1j@CvGWSNaf0e* zV6YHmo>f#|HVYUIId&;`Lzj+S9V~h(;|QMme10yOQ0E2Gjf{SCz9&HUVsH6&_;Gk2 zY%Yf(l=Ztj76F--HOtaW!!sWg!Bvwds||8)%pq^(6upf?Wo){}Bd!2igXblVnFVC# z;}mmafoypG!9bAoz_K51ZmBQ_gj{m2Vd8>1!BArygfuhNSJ}21UB&RSr@pQ9;S{`Y z_+Y}E>;6=#H5sNt#NLs)H#b7d*pc^UGRPqk?%hMI2qzZ?VVyLDAv%cRkvJ&tDW!rx zbR2N*_Q-OYIJ%s-W0#>2qAhrCaH_eR)p&UuEE(Z6HKI#=xCD1b!#IW)~O2c1>~ z9NMW;PtB85?9F0bvm(?T{)EOr_l{-orSAmplcB&UHpy&C^mD*JbwCS@*qyqapIm@} zdn52d0@i`4n=7ym{uQt@zaFro$0@@MFhTuiT&AnfW{d3`vL!0rfbJAdnNc1f;@l<2 z*Yrx@l17mdFAuCR`YxUpT7|h#CRD=PL4qh+exR(yLm<8vSz?+lRQUN36!#L>9{(IY z9|{pp;_)zV_Ol6#2aOkVThq@)6_q;o6VuJ#nxJX5fIw=%&#g*e5qAQM_+`*Ev$yyk z5Bt~G*PVQ|gvC{Kd;G)bbS#>O+R`&ZU+W(x&Lk)oXq@Vg;wQ$}=1 zSZ=Q~EQH{5a!1o{Bb5`h4}KJ?M$~h663qqq=G-{<4yLtfM)3xIqCL=LIW>+-k?-f~ zH)lGKDwsmnhme^2-ST}JGS?6nbM4w;1NBGS7_i$Y=P+_ly_yp7nq`Rrp&2h4!H7dF zZ#_A=R>Ue~WN1Xio-abCjY5{an>mP~zS44G-VYHbYE-jHWb@K=$pqk0!?WZ>91Hou ztET#7FyF(eVfT83pjAeUmFe;)1TyqF29hd7vJ!Z@h^Aq@z=hU>X+JKzeIiw%;AeZj zkw^Uo8R~rTg~wjV*u%34{l{+RRpY)JSu5-q*#vQeJe}a z4=_k*Z#l5*v^Pak3qyBUPtH9CLuZ0AnTvJM1UI6x$4vSJyH5l3hc@p? z9Z!l1a7kA3g}t)kGtH+Foi82&0=oaHSvjEhsUcqRZ~z8mC$wXnQ2<|;7+EmqZ=If*>QD^lm$kxzc<%1u7 zA6Ap}V|0h +^RpxUQFFx}u&hxv<&xZUQOKZGbrEnGdRSIC>Hsg;>ct68I>nKXVT zQ=G)H6o!ra5Q6b3OmIW>?I>N~=~CWRLJBqg6B-=OgXN4=Wkd~PY#eFfniKDt#6SX) zp_NJ2R!Rh8@)T+7qMVk{;Bu%Y&$~$fT#!@A-gC6szV4=XL`FtWEC+s}Ontg{XYl5A z#F)4p+trMn?2Yxt(IVWqL5aZcBsb!3n-+X^igZELl?a`4BvSM}{msmR&z4OcG3T%8 z-`WNWux)I2(Mtn>5^Mmk7YR&XojyQc?;G?=!Y>DpPqEAR)4qh-5C)}n_C$!!M%^$y zPqG&Hn=~OQs9|C#3^Q)TdsxAwMj{BYhK&AIT!hR^V`Ju$OhfTfequ(b8;GA$bST>y z!)|~;iMJcr|CXR;(#29+~bb$7l66Q%?jo7@lvPIQSR{88LHkr z7u)hzcxq`kKD1J|1MKeC&v2A1tKXP4T4yTGWBB+=Q}`~9Nq*#%Ec}Mdfe;(L8*S5@ zi>Trn&ub&1$bydZF+zGfA=ZawU(RT)+v#l`BI4u)kuNMKZtLW;#4Wr>-hrPHjN*H; z;HQZ}>Kv?Jwsd>kbvziP76bc9zt7==z}$0tA!gJR*nLBQ7Z&hVTDsX=7&*V}%dd>$ zEU%}Z@G%>6viHKGfi!mMXIRXsu6cWPr2^sh3)bF)WBHWP73b&Ek(0q8qfv+syM~36uIw4} zTUyHvJcDz?h6Qhd;wS&_88mk`v;WgL>|Z~F^hvTwt3+@@ONFWv3s}1P@|Y6#p?cB^ z+4?015X*a!a;ctg$nxX$v*6QFMlkq2&e5mxRcdA_=i#;RzZvN_vzV$Dp>J0GB>vjP z7cx$hjUWC@S?{woieJmGv{C9~zIqdr6?Ldzt4y=rJk1$s<*2r7K*%hvI|q(=Hex{nDyaZHvDndTvF9UDvgb(9>M+d4=765$poT22OQBA2W}XZhxuGl z!xLvi0c%a}s@c(AHiZ5k;=^4srAO(7&hBd@ElyVXqOkSd&MZ6_3qm&KdJ6dVT{h=b z$RpZ#Y_tm~frs#}N;_drUB_}?YR$VMYEc@FeYV{Pr>r=L~9yKhPw-3L-m^zNZVKrVeV9S z*~c3SP#4Os*$uv>`u_s_(=4Td9;q2jd-DV=tz)2A7{Ky!b#u1)|MooY|9g&3{kl)` zD@iN#Fv0QdXkKC?=yCPH>KT`%!jr?RJZwD%sv^d$1@C6olk#GRZ+IDPPc%7#NJqgi zymO3P%Cb^US%I@g^J*UfarzlZa2y&GR3|G5&54OBND$_T<>{sZ6Z}&spSJ;r*PD3f z!{r=DQc6k@pMdXiL6Z!`C%Cl=iAn_(ttGPS$8SHbvl2as-#g-a*<}5Qf-|2%-$G#O zMq*YOv)tMbp5mj*Qv)-y6W6Kh9Hn>Eo*#W%h5`2_>Lc0e&Z7a1snS{ @@ -49,53 +57,72 @@ class KarTextExtractor: self.currentindex = 0 self.encoding = "" self.defaultencoding = "" - self.acceptnulls = False + self.hadnulls = False + # Compute the fallback encoding to use if we can't determine + # one when processing the file. Based on the nls environment try: self.defaultencoding = sys.getfilesystemencoding() except: pass + if self.defaultencoding is None: self.defaultencoding = sys.getdefaultencoding() if not self.defaultencoding or \ self.defaultencoding.lower().find('ascii') != -1: - self.defaultencoding = 'latin_1' + self.defaultencoding = 'cp1252' + try: codecs.lookup(self.defaultencoding) except: - self.defaultencoding = 'latin_1' + self.defaultencoding = 'cp1252' + + + def nulltrunc(self, data): + '''Truncate data after 1st null byte. For messages with garbage after + a null byte. Must not be done for utf-16/32 of course''' - # Try to decode input binary string then encode to utf-8 for output - def reencode(self, data): - text = "" if not data: - return text + return data - # Some files have garbage data after a null byte. - if not self.acceptnulls: - firstnull = data.find(chr(0)) - if firstnull != -1: - data = data[0 : firstnull] - - try: - text = data.decode(self.encoding, 'ignore') - except Exception, err: - self.em.rclog("Decode failed: " + str(err)) - return "" - try: - text = text.encode('utf-8') - except Exception, err: - self.em.rclog("Encode failed: " + str(err)) - return "" + firstnull = data.find(chr(0)) + if firstnull != -1: + self.hadnulls = True + data = data[0 : firstnull] + return data + + + def reencode(self, data): + '''Decode from whatever encoding we think this file is using + and reencode as UTF-8''' + + # self.em.rclog("Reencoding from [%s] to UTF-8" % self.encoding) + + if data: + try: + data = data.decode(self.encoding, 'ignore') + except Exception, err: + self.em.rclog("Decode failed: " + str(err)) + return "" + try: + data = data.encode('utf-8') + except Exception, err: + self.em.rclog("Encode failed: " + str(err)) + return "" - text = self.em.htmlescape(text).replace("\n", "
\n") - return text + data = self.em.htmlescape(data).replace("\n", "
\n") + + return data + - # Some karaoke files have the encoding as part of the file name - # as 'some title (encoding).xxx' Not sure the whitespace before - # the '(' has to be there, so not relying on this def encodingfromfilename(self, fn): + '''Compute encoding from file name: some karaoke files have the + encoding as part of the file name as 'some title + (encoding).xxx'. This is not an established convention though, + just one our users could use if there is trouble with guessing + encodings''' + rexp = r'\(([^\)]+)\)\.[a-zA-Z]+$' m = re.search(rexp, fn) if m: @@ -103,7 +130,43 @@ class KarTextExtractor: else: return "" + def chardet_detect(self, text): + encodconf = chardet.detect(text) + encoding = encodconf['encoding'] + confidence = encodconf['confidence'] + self.em.rclog("Chardet returns %s %.2f" % (encoding,confidence)) + # chardet is awfully bad at detecting 8bit european + # encodings/languages and will mostly return iso-8859-2 for + # everything, which is a bad default (iso-8859-1/cp1252 being + # much more common). We use our own ad-hoc stopwords based + # module to try and improve + if encoding.lower() == 'iso-8859-2': + try: + import __main__ + dir = os.path.dirname(__main__.__file__) + langszip = os.path.join(dir, 'iso8859stops.zip') + f = open(langszip) + f.close() + except: + self.em.rclog("Can't the find the language stopwords zipfile") + return (encoding, confidence) + try: + classifier = eulangclass.European8859TextClassifier(langszip) + lang,code,count = classifier.classify(text) + self.em.rclog("euclass lang/code/matchcount: %s %s %d" % \ + (lang, code, count)) + if count > 0: + confidence = 1.0 + encoding = code + except Exception, err: + self.em.rclog("stopwords-based classifier failed: %s" % err) + return (encoding, confidence) + + return (encoding, confidence) + + def extractone(self, params): + '''Process one file''' docdata = "" ok = False @@ -112,15 +175,13 @@ class KarTextExtractor: return (ok, docdata, "", rclexecm.RclExecM.eofnow) filename = params["filename:"] + # Character encoding from file name ? self.encoding = self.encodingfromfilename(filename) - try: - codecs.lookup(self.encoding) - except: - self.em.rclog("Encoding [%s] not found, defaulting to [%s]" % \ - (self.encoding, self.defaultencoding)) - self.encoding = self.defaultencoding - - self.acceptnulls = self.encoding.lower() in self.acceptnullencodings + if self.encoding: + try: + codecs.lookup(self.encoding) + except: + self.encoding = "" # Mimetype not used for now if not params.has_key("mimetype:"): @@ -128,41 +189,47 @@ class KarTextExtractor: else: mimetype = params["mimetype:"] + # Read in and midi-decode the file try: stream = midi.read_midifile(filename) except Exception, err: - self.em.rclog("extractone: extract failed: [%s]" % err) + self.em.rclog("extractone: midi extract failed: [%s]" % err) return (ok, docdata, "", rclexecm.RclExecM.eofnow) title = None author = None language = None lyrics = "" - + lyricsN = "" + self.hadnulls = False + for event in stream.iterevents(): edata = "" if isinstance(event, midi.TextMetaEvent): if not event.data: continue elif event.data[0] == '/' or event.data[0] == '\\': - edata += "\n" + event.data[1:] + edata = "\n" + event.data[1:] elif event.data[0] == '[' or event.data[0] == ']': - edata += event.data[1:] + edata = event.data[1:] elif event.data[0] == '@': if len(event.data) == 1: continue else: if event.data[1] == 'I': - edata += event.data[2:] + '\n' + edata = event.data[2:] + '\n' elif event.data[1] == 'L': - language = self.reencode(event.data[2:]) + language = self.nulltrunc(event.data[2:]) + languageN = event.data[2:] elif event.data[1] == 'T': if title is None: - title = self.reencode(event.data[2:]) + title = self.nulltrunc(event.data[2:]) + titleN = event.data[2:] elif author is None: - author = self.reencode(event.data[2:]) + author = self.nulltrunc(event.data[2:]) + authorN = event.data[2:] else: - edata += event.data + edata = event.data elif isinstance(event, midi.LryricsEvent) or \ isinstance(event, midi.TrackNameEvent): space = "" @@ -171,13 +238,44 @@ class KarTextExtractor: if not event.data: continue elif event.data[0] == '/' or event.data[0] == '\\': - edata += "\n" + event.data[1:] + nl + edata = "\n" + event.data[1:] + nl else: - edata += event.data + nl + edata = event.data + nl - lyrics += self.reencode(edata) + lyrics += self.nulltrunc(edata) + lyricsN += edata - + + # Try to guess the encoding. First do it with the data + # possibly containing nulls. If we get one of the accepted + # nullbyte encodings, go with this, else repeat with the + # de-nulled data + + # self.em.rclog("Lyrics length %d" % len(lyrics)) + + if self.encoding == "" and has_chardet: + if self.hadnulls: + (encoding, confidence) = self.chardet_detect(lyricsN) + # self.em.rclog("With nulls: chardet: enc [%s], conf %.2f" % \ + # (encoding, confidence)) + if confidence > 0.6 and \ + encoding.lower() in self.acceptnullencodings: + self.encoding = encoding + lyrics = lyricsN + title = titleN + author = authorN + if self.encoding == "": + (encoding, confidence) = self.chardet_detect(lyrics) + self.em.rclog("No nulls: chardet: enc [%s], conf %.2f" % \ + (encoding, confidence)) + if confidence > 0.6: + self.encoding = encoding + + if self.encoding == "": + self.em.rclog("Encoding not guessed, defaulting to [%s]" % \ + (self.defaultencoding,)) + self.encoding = self.defaultencoding + if title is None: title = "" if author is None: @@ -185,6 +283,10 @@ class KarTextExtractor: if language is None: language = "" + title = self.reencode(title) + author = self.reencode(author) + lyrics = self.reencode(lyrics) + self.em.setmimetype("text/html") docdata = htmltemplate % (title, author, language, lyrics)