82808258f23d4617fa125074cb09b4d8fdb50bea
[Utf8Splitter.git] / Utf8Splitter.py
1 # -*- coding: utf-8 -*-
2
3 """
4 Inspiré de l'UnicodeSplitter de Plone 2.1, avec un aller-retour Unicode <-> UTF-8 pour le découpage.
5
6 $Id: Utf8Splitter.py 16 2009-08-31 11:36:17Z pin $
7 $URL: http://svn.cri.ensmp.fr/svn/Utf8Splitter/trunk/Utf8Splitter.py $
8
9 """
10
11 # Python
12 import re
13 from htmlentitydefs import name2codepoint
14 from unicodedata import decomposition
15 from string import printable
16 import logging
17 from types import UnicodeType
# Module-level logger; desacc() uses it to report unicode values it did not expect.
console = logging.getLogger('Utf8Splitter')
19
20 # Zope
21 from Products.ZCTextIndex.ISplitter import ISplitter
22 from Products.ZCTextIndex.PipelineFactory import element_factory
23
# Word tokens: one or more Unicode "word" characters.
rx = re.compile(r"\w+", re.UNICODE)
# Glob-aware tokens: a word start followed by any mix of word characters and
# the '*' / '?' wildcards, so that a term such as "foo*" stays in one piece.
rxGlob = re.compile(r"\w+[\w*?]*", re.UNICODE)

# Any tag: everything from '<' up to the next '>'.
rtag = re.compile(r"<[^<>]*>")
# Named character entities such as "&eacute;".  Digits are allowed after the
# first letter because several standard entity names contain them
# (sup2, frac12, there4, ...); they were previously left unconverted.
rent = re.compile(r"&(?P<entName>[A-Za-z][A-Za-z0-9]*);")

# Lookup table of the characters listed in string.printable.
_printable = dict.fromkeys(printable, True)
# Membership test on the table above (dict.has_key() is deprecated).
isPrintable = _printable.__contains__
32
class Utf8Splitter:
    """Plain-text UTF-8 word splitter.

    Each input string is decoded from UTF-8, cut into words with a
    Unicode-aware regular expression, and the words are handed back
    re-encoded as UTF-8 byte strings.
    """
    __implements__ = ISplitter

    def process(self, lst, wordpat=rx):
        """Split every string of ``lst`` into words.

        lst     -- iterable of UTF-8 encoded byte strings; unicode objects
                   are accepted too and used as-is.
        wordpat -- compiled pattern whose matches are the words
                   (defaults to ``rx``).

        Returns a flat list of UTF-8 encoded words, in input order.
        """
        words = []
        for s in lst:
            if not isinstance(s, unicode):
                # unicode(<unicode>, 'utf-8') raises TypeError, so only byte
                # strings are decoded; undecodable bytes are dropped.
                s = unicode(s, 'utf-8', 'ignore')
            words.extend(wordpat.findall(s))
        return [w.encode('utf-8') for w in words]

    def processGlob(self, lst):
        """Same as process(), but with the glob-aware pattern ``rxGlob``."""
        return self.process(lst, rxGlob)
46
47
48
class Utf8HTMLAwareSplitter:
    """HTML-aware UTF-8 word splitter.

    Before the text is cut into words, tags are replaced by a space and
    named character entities are replaced through ``_convertEnt``.
    """
    __implements__ = ISplitter

    def process(self, lst, wordpat=rx):
        """Strip markup from every string of ``lst`` and split it into words.

        lst     -- iterable of UTF-8 encoded byte strings; unicode objects
                   are accepted too.
        wordpat -- compiled pattern whose matches are the words
                   (defaults to ``rx``).

        Returns a flat list of UTF-8 encoded words, in input order.
        """
        words = []
        for s in lst:
            if isinstance(s, unicode):
                # _convertEnt yields UTF-8 byte strings; substituting those
                # into a unicode subject (and decoding it afterwards) would
                # go through implicit ASCII conversions and fail on accents.
                s = s.encode('utf-8')
            s = rtag.sub(' ', s)
            s = rent.sub(_convertEnt, s)
            # Undecodable byte sequences are dropped rather than raising.
            words.extend(wordpat.findall(s.decode('utf-8', 'ignore')))
        return [w.encode('utf-8') for w in words]

    def processGlob(self, lst):
        """Same as process(), but with the glob-aware pattern ``rxGlob``."""
        return self.process(lst, rxGlob)
67
68
69
class DesaccUtf8Splitter(Utf8Splitter):
    """Plain-text UTF-8 splitter that removes accents before splitting.
    """
    def process(self, lst, wordpat=rx):
        # Run every input string through _desacc first, then let the parent
        # class do the actual splitting with the requested pattern.
        unaccented = []
        for item in lst:
            unaccented.append(_desacc(item))
        return Utf8Splitter.process(self, unaccented, wordpat)
75
76
77
class DesaccUtf8HTMLAwareSplitter(Utf8HTMLAwareSplitter):
    """HTML-aware UTF-8 word splitter with accents removal.

    Accents are removed after entity conversion, so that accented characters
    written as entities are stripped of their accent as well.
    """
    def process(self, lst, wordpat=rx):
        """Strip markup and accents from every string of ``lst``, then split.

        lst     -- iterable of UTF-8 encoded byte strings; unicode objects
                   are accepted too.
        wordpat -- compiled pattern whose matches are the words
                   (defaults to ``rx``).

        Returns a flat list of UTF-8 encoded words, in input order.
        """
        words = []
        for s in lst:
            if isinstance(s, unicode):
                # _convertEnt yields UTF-8 byte strings; substituting those
                # into a unicode subject would trigger an implicit ASCII
                # decode and fail on accented characters.
                s = s.encode('utf-8')
            s = rtag.sub(' ', s)
            s = rent.sub(_convertEnt, s)
            # _desacc hands back a UTF-8 byte string: decode it so that the
            # Unicode-aware pattern runs on text rather than on raw bytes.
            s = _desacc(s).decode('utf-8', 'ignore')
            words.extend(wordpat.findall(s))
        return [w.encode('utf-8') for w in words]
91
class _Utf8Utils(object):
    """Singleton holding the entity-conversion and accent-removal helpers.

    Accent removal results are memoised per character in ``self._cache``.
    """

    _singleton = None

    def __new__(cls):
        if cls._singleton is None:
            cls._singleton = object.__new__(cls)
        return cls._singleton

    def __init__(self):
        # __init__ runs on every _Utf8Utils() call, even when __new__ hands
        # back the existing instance: never discard a cache already built.
        if getattr(self, '_cache', None) is None:
            self._cache = {}

    @staticmethod
    def convertEnt(m):
        """Convert a matched HTML entity to its UTF-8 representation.

        m -- match object exposing the entity name as group ``entName``.

        Unknown entity names are converted to a space (code point 32).
        """
        return unichr(name2codepoint.get(m.group('entName'), 32)).encode('utf-8')

    def udesacc(self, uchaine):
        """Remove accents from a unicode string; returns a unicode string."""
        cache = self._cache
        ret = []
        for uc in uchaine:
            stripped = cache.get(uc)
            if stripped is None:
                # Test for None rather than truthiness: an empty replacement
                # is a valid cached value and must not be recomputed.
                stripped = self._recurseDecomposition(uc)
            ret.append(stripped)
        return u''.join(ret)

    def desacc(self, chaine):
        """Remove accents from a UTF-8 encoded string.

        chaine -- UTF-8 byte string; undecodable bytes are ignored.  A unicode
                  value is tolerated but logged as a warning, since callers
                  are expected to pass encoded strings.

        Returns the unaccented text encoded as UTF-8.
        """
        if isinstance(chaine, UnicodeType):
            console.warning('already unicode value passed to desacc: %r', chaine)
            uchaine = chaine
        else:
            uchaine = chaine.decode('utf-8', 'ignore')
        return self.udesacc(uchaine).encode('utf-8')

    def _recurseDecomposition(self, uc):
        """Fully decompose one character and keep its printable parts.

        uc -- a single unicode character.

        The decomposition mapping of ``uc`` is expanded until only characters
        without any further decomposition remain; those not listed in
        ``string.printable`` are then dropped.  The result (possibly an empty
        string) is stored in the cache and returned.
        """
        pending = decomposition(uc).split()
        if not pending:
            leaves = [uc]
        else:
            leaves = []
            # Codes are consumed from the end of the list, so the leaves come
            # out in reverse order and are put back in order afterwards.
            while pending:
                code = pending.pop()
                if code.startswith('<'):
                    # Formatting tag such as "<compat>", not a code point.
                    continue
                c = unichr(int(code, 16))
                sub = decomposition(c).split()
                if sub:
                    pending.extend(sub)
                else:
                    leaves.append(c)
            leaves.reverse()

        result = u''.join([c for c in leaves if isPrintable(c)])
        self._cache[uc] = result
        return result
154
# Shared helper instance (_Utf8Utils() always hands back its singleton).
Utf8Utils = _Utf8Utils()

# Module-level shortcuts used by the splitter classes of this module.
_desacc = Utf8Utils.desacc
_convertEnt = Utf8Utils.convertEnt
159
# Register the four splitters as 'Word Splitter' pipeline elements.  Each
# registration is attempted on its own, so that one name already being taken
# does not prevent the remaining splitters from being registered.
for _name, _factory in (
        ('UTF-8 Whitespace splitter', Utf8Splitter),
        ('UTF-8 HTML Aware splitter', Utf8HTMLAwareSplitter),
        ('UTF-8 Whitespace splitter with accents removal', DesaccUtf8Splitter),
        ('UTF-8 HTML Aware splitter with accents removal', DesaccUtf8HTMLAwareSplitter),
        ):
    try:
        element_factory.registerFactory('Word Splitter', _name, _factory)
    except ValueError:
        # ValueError is raised when the splitter is already registered.
        console.debug('splitter %r is already registered', _name)
del _name, _factory
176
177