1 # -*- coding: utf-8 -*-
4 Inspiré de l'UnicodeSplitter de Plone 2.1, avec un aller-retour Unicode <-> UTF-8 pour le découpage.
6 $Id: Utf8Splitter.py 16 2009-08-31 11:36:17Z pin $
7 $URL: http://svn.cri.ensmp.fr/svn/Utf8Splitter/trunk/Utf8Splitter.py $
13 from htmlentitydefs
import name2codepoint
14 from unicodedata
import decomposition
15 from string
import printable
17 from types
import UnicodeType
# Module-level logger shared by the helpers in this file.
console = logging.getLogger('Utf8Splitter')
21 from Products
.ZCTextIndex
.ISplitter
import ISplitter
22 from Products
.ZCTextIndex
.PipelineFactory
import element_factory
24 rx
= re
.compile(r
"\w+", re
.UNICODE
)
25 rxGlob
= re
.compile(r
"\w+[\w*?]*", re
.UNICODE
)
27 rtag
= re
.compile(r
"<[^<>]*>")
28 rent
= re
.compile(r
"&(?P<entName>[A-Za-z]+);")
30 _printable
= dict([(c
, True) for c
in printable
])
31 isPrintable
= _printable
.has_key
class Utf8Splitter:
    """Plain-text UTF-8 whitespace splitter.

    Tokenizes UTF-8 encoded strings by decoding them to Unicode, applying
    a word pattern, and encoding the words back to UTF-8.
    """

    __implements__ = ISplitter

    def process(self, lst, wordpat=rx):
        """Return the words found in the strings of ``lst``.

        ``lst`` is an iterable of UTF-8 encoded strings; bytes that cannot
        be decoded are ignored.  ``wordpat`` is the compiled pattern used
        to find the words.  The result is a flat list of UTF-8 encoded
        words, in order of appearance.
        """
        result = []
        for s in lst:
            # Match on Unicode text so that \w sees whole characters
            # rather than the individual bytes of a UTF-8 sequence.
            result.extend(wordpat.findall(unicode(s, 'utf-8', errors='ignore')))
        return [r.encode('utf-8') for r in result]

    def processGlob(self, lst):
        """Like process(), but with the glob pattern that keeps '*' and '?'."""
        return self.process(lst, rxGlob)
class Utf8HTMLAwareSplitter:
    """HTML-aware UTF-8 whitespace splitter.

    Named character entities are converted to their UTF-8 form before the
    text is decoded and tokenized.
    """

    __implements__ = ISplitter

    def process(self, lst, wordpat=rx):
        """Return the words found in the strings of ``lst``.

        ``lst`` is an iterable of UTF-8 encoded strings; bytes that cannot
        be decoded are ignored.  ``wordpat`` is the compiled pattern used
        to find the words.  The result is a flat list of UTF-8 encoded
        words, in order of appearance.
        """
        result = []
        for s in lst:
            # NOTE(review): tag stripping with rtag is assumed here (the
            # pattern is otherwise unused in this module) -- confirm the
            # replacement text against the repository version.
            s = rtag.sub(' ', s)
            # Replace named entities (e.g. "&eacute;") while the text is
            # still a UTF-8 byte string, then decode for Unicode matching.
            s = rent.sub(_convertEnt, s)
            s = s.decode('utf-8', 'ignore')
            result.extend(wordpat.findall(s))
        return [r.encode('utf-8') for r in result]

    def processGlob(self, lst):
        """Like process(), but with the glob pattern that keeps '*' and '?'."""
        return self.process(lst, rxGlob)
class DesaccUtf8Splitter(Utf8Splitter):
    """Plain-text UTF-8 whitespace splitter with accents removal."""

    def process(self, lst, wordpat=rx):
        """Pass every string of ``lst`` through _desacc, then split it
        with Utf8Splitter.process using ``wordpat``."""
        unaccented = [_desacc(s) for s in lst]
        return Utf8Splitter.process(self, unaccented, wordpat)
class DesaccUtf8HTMLAwareSplitter(Utf8HTMLAwareSplitter):
    """HTML-aware UTF-8 whitespace splitter with accents removal."""

    def process(self, lst, wordpat=rx):
        """Return the unaccented words found in the strings of ``lst``.

        ``lst`` is an iterable of UTF-8 encoded strings and ``wordpat``
        the compiled pattern used to find the words.  The result is a flat
        list of UTF-8 encoded words, in order of appearance.
        """
        result = []
        for s in lst:
            # NOTE(review): tag stripping with rtag is assumed here, as in
            # the parent class -- confirm against the repository version.
            s = rtag.sub(' ', s)
            # Entities are converted first so that the characters they
            # stand for are available to the accent-removal step.
            s = rent.sub(_convertEnt, s)
            # NOTE(review): the accent-removal and decoding steps are
            # reconstructed from the class contract -- confirm.
            s = _desacc(s)
            s = s.decode('utf-8', 'ignore')
            result.extend(wordpat.findall(s))
        return [r.encode('utf-8') for r in result]
# Helper class grouping HTML-entity conversion and accent removal.
# NOTE(review): several lines of this class are not visible in this
# excerpt; the comments below describe only the statements that are shown.
92 class _Utf8Utils(object) :
# The instance is created only while cls._singleton is still None, so a
# previously created instance is kept.
97 if cls
._singleton
is None :
98 cls
._singleton
= object.__new
__(cls
)
# Entity conversion (the module later exposes Utf8Utils.convertEnt as
# _convertEnt): the "entName" group of the match m is looked up in
# name2codepoint, unknown names fall back to code point 32, and the
# resulting character is returned UTF-8 encoded.
107 """Convert an HTML entity to its UTF-8 representation
109 return unichr(name2codepoint
.get(m
.group('entName'), 32)).encode('utf-8')
# Accent removal on an already-decoded string.
111 def udesacc(self
, uchaine
) :
# Use the cached replacement for uc when it is truthy, otherwise compute
# it with _recurseDecomposition.
114 ret
.append(self
._cache
.get(uc
) or self
._recurseDecomposition
(uc
))
# Accent removal on a UTF-8 encoded string; the result of udesacc is
# encoded back to UTF-8 before being returned.
118 def desacc(self
, chaine
):
119 """Remove accents from a UTF-8 string
# Decode, ignoring bytes that are not valid UTF-8.
122 uchaine
= chaine
.decode('utf-8', 'ignore')
# When decoding raises UnicodeEncodeError and the argument is a unicode
# object, a warning is logged.
# NOTE(review): the rest of this handler is not visible here.
123 except UnicodeEncodeError :
124 if type(chaine
) == UnicodeType
:
125 console
.warn('already unicode value passed to desacc: %r' % chaine
)
129 ret
= self
.udesacc(uchaine
)
130 return ret
.encode('utf-8')
# Computes the replacement text for the single character uc and stores it
# in self._cache.
133 def _recurseDecomposition(self
, uc
):
# Fields of the Unicode decomposition mapping of uc (an empty list when
# the character has no decomposition).
134 deco
= decomposition(uc
).split()
# Fields starting with '<' are tested for separately.
# NOTE(review): the body of this branch is not visible here.
139 if code
.startswith('<') :
# The remaining fields are read as hexadecimal code points; the character
# obtained is itself looked up for a further decomposition.
141 c
= unichr(int(code
, 16))
142 subDeco
= decomposition(c
).split()
# Keep only the characters accepted by isPrintable, join them, and cache
# the result under uc.
151 fullDeco
= u
''.join(filter(lambda c
: isPrintable(c
), fullDeco
))
152 self
._cache
[uc
] = fullDeco
# Helper instance used by the splitter classes of this module.
Utf8Utils = _Utf8Utils()

# Short module-level aliases for its bound methods.
_desacc, _convertEnt = Utf8Utils.desacc, Utf8Utils.convertEnt
161 element_factory
.registerFactory( 'Word Splitter',
162 'UTF-8 Whitespace splitter', Utf8Splitter
)
164 element_factory
.registerFactory( 'Word Splitter',
165 'UTF-8 HTML Aware splitter', Utf8HTMLAwareSplitter
)
167 element_factory
.registerFactory( 'Word Splitter',
168 'UTF-8 Whitespace splitter with accents removal', DesaccUtf8Splitter
)
170 element_factory
.registerFactory( 'Word Splitter',
171 'UTF-8 HTML Aware splitter with accents removal', DesaccUtf8HTMLAwareSplitter
)
174 # in case the splitter is already registered, ValueError is raised