[Hejes-devel] [770] spell.py: use only hunspell, top 5 suggestions, no suggestions with spaces
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Thu May 23 17:01:06 CEST 2013
Revision: 770
Author: mihaltz
Date: 2013-05-23 17:01:06 +0200 (Thu, 23 May 2013)
Log Message:
-----------
spell.py: use only hunspell, top 5 suggestions, no suggestions with spaces
Modified Paths:
--------------
trunk/web2py/applications/helyesiras_webdev/modules/spell.py
Modified: trunk/web2py/applications/helyesiras_webdev/modules/spell.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/spell.py 2013-05-22 14:24:29 UTC (rev 769)
+++ trunk/web2py/applications/helyesiras_webdev/modules/spell.py 2013-05-23 15:01:06 UTC (rev 770)
@@ -29,6 +29,9 @@
CMD_HUNSPELL = ["hunspell", "-d", "hu_HU", "-i", "utf-8"]
"""Command-line string for calling hunspell"""
+TOPNSUGG = 5
+"""Use only top n spell suggestions from the engine(s) for unknown tokens"""
+
PRONOUNS = {
'TPe3': u'ő',
'TPt1': u'mi',
@@ -148,7 +151,8 @@
def union_humor_hunspell(unicode_toks):
- """Calls humor + hunspell on elements of unicode_toks, which is an array of Unicode strings (tokens).
+ """This is an experimental modification of union_humor_hunspell_DEPRECATED().
+ Calls only hunspell. Suggestions are filtered: those containing a space are skipped, and only the top TOPNSUGG are kept.
ALSO uses exception dictionary SPELL_EXC_DICT: if token is found, use value from there,
regardless of either engines' outputs.
NOTE: this differs from the humor and hunspell functions, which expect arrays of normal strings, not Unicode strings!
@@ -164,6 +168,52 @@
ret = []
# convert input to normal strings (utf8 encoding)
inp = [x.encode('utf8') for x in unicode_toks]
+ # call the engine
+ hunspout = call_hunspell_cmdline_ntok(inp)
+ # process
+ ret = hunspout
+ for i, x in enumerate(hunspout):
+ # override with exception dictionary (if applicable)
+ if len(hunspout) == len(inp): # for safety
+ e = SPELL_EXC_DICT.get(inp[i])
+ if e != None:
+ ret[i] = e
+ continue
+ # filter suggestions
+ if ret[i][0] == False:
+ tmp = []
+ for s in ret[i][1]:
+ if ' ' in s:
+ continue
+ tmp.append(s)
+ if len(tmp) == TOPNSUGG:
+ break
+ ret[i] = (False, tmp)
+ # convert suggestions to Unicode strings
+ for i in range(0, len(ret)):
+ if ret[i][0] == False:
+ ret[i] = (False, [unicode(x, 'utf8', 'replace') for x in ret[i][1]])
+ return ret
+
+
+def union_humor_hunspell_DEPRECATED(unicode_toks):
+ """DEPRECATED. See union_humor_hunspell().
+ Calls humor + hunspell on elements of unicode_toks, which is an array of Unicode strings (tokens).
+ ALSO uses exception dictionary SPELL_EXC_DICT: if token is found, use value from there,
+ regardless of either engine's output.
+ NOTE: this differs from the humor and hunspell functions, which expect arrays of normal strings, not Unicode strings!
+ Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
+ token_is_correct: True if the nth token was recognized, False otherwise
+ [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms (Unicode strings), or [] if no suggestions available.
+ Returns [None, <error message>] if some critical error occurred.
+ Returns [] if input was empty array or contained no meaningful tokens (e.g. only punctuation).
+
+ token_is_correct: True if either of the 2 engines recognized the token.
+ list_of_suggestions: union of the 2 engines' suggestions.
+ """
+ ret = []
+ # convert input to normal strings (utf8 encoding)
+ inp = [x.encode('utf8') for x in unicode_toks]
# call the 2 engines
humorout = call_humor_spellchecker_cmdline_ntok(inp)
hunspout = call_hunspell_cmdline_ntok(inp)
More information about the Hejes-devel
mailing list