facendo così: self.chardict={u"“": "'", u"”": "'", u"—": "-", u"’": "'", u"è": "e'", u"é": "e'"}
def gio_solution(self, s):
    SUBS = re.compile(u"([%s])" % '|'.join(self.chardict.keys()))
    return SUBS

mi dice:

File "extract_sentences.py", line 115, in <module>
    sentences = extract_sentences_from_webpage.get_sentences_from_text()
File "extract_sentences.py", line 107, in get_sentences_from_text
    return self.sentences_extraction_meth()
File "extract_sentences.py", line 100, in sentences_extraction_meth
    self.sentences_extracted_list = self.tokenizer.tokenize(self.extracted_text_u)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1270, in tokenize
    return list(self.sentences_from_text(text, realign_boundaries))
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1318, in sentences_from_text
    return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1309, in span_tokenize
    return [(sl.start, sl.stop) for sl in slices]
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1348, in _realign_boundaries
    for sl1, sl2 in _pair_iter(slices):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 354, in _pair_iter
    prev = next(it)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1322, in _slices_from_text
    for match in self._lang_vars.period_context_re().finditer(text):
TypeError: expected string or buffer

Facendo così:

def elimina_accenti(self, stringa):
    return "".join((c for c in unicodedata.normalize('NFD', stringa) if unicodedata.category(c) != 'Mn'))

self.extracted_text_u = self.elimina_accenti(unicode(self.extracted_text_u, "utf-8"))

anche in questo caso non si hanno errori ma rimangono ancora dei caratteri spuri. Per cui penso che debba proprio approfondire l'uso di chardet, per la encoding detection.
Il 29 gennaio 2015 10:32, Diego Barrera <diegonebarr...@yahoo.it> ha scritto:
> Il 28/01/2015 18:15, Marco Ippolito ha scritto:
>>
>> Ciao a tutti,
>>
>> ho messo in file json alcune possibili sostituzioni di simboli
>> all'interno di un testo:
>> "to_substitute":{
>> "“": "'",
>> "”": "'",
>> "—": "-",
>> "’": "'",
>> "è": "e'",
>> "é": "e'"
>> }
>>
>>
>
> import unicodedata
>
> def elimina_accenti(stringa):
>     return ''.join((c for c in unicodedata.normalize('NFD', stringa) if
>         unicodedata.category(c) != 'Mn'))
>
> _______________________________________________
> Python mailing list
> Python@lists.python.it
> http://lists.python.it/mailman/listinfo/python

_______________________________________________
Python mailing list
Python@lists.python.it
http://lists.python.it/mailman/listinfo/python