[Hejes-devel] [779] spell.py: hunspell only; skip suggestions that contain spaces when any token is only 1 char long.
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Wed May 29 17:59:00 CEST 2013
Revision: 779
Author: mihaltz
Date: 2013-05-29 17:59:00 +0200 (Wed, 29 May 2013)
Log Message:
-----------
spell.py: hunspell only; skip suggestions that contain spaces when any token is only 1 char long.
Modified Paths:
--------------
trunk/web2py/applications/helyesiras_webdev/modules/spell.py
Modified: trunk/web2py/applications/helyesiras_webdev/modules/spell.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/spell.py 2013-05-29 15:18:43 UTC (rev 778)
+++ trunk/web2py/applications/helyesiras_webdev/modules/spell.py 2013-05-29 15:59:00 UTC (rev 779)
@@ -152,7 +152,7 @@
def union_humor_hunspell(unicode_toks):
"""This is an experimental modification of union_humor_hunspell().
- Calls only hunspell. Suggestions are filtered: skip if contains space, only top TOPNSUGG
+ Calls only hunspell. Suggestions are filtered: skip if contains space and any token is 1 char long, keep only up to TOPNSUGG suggestions.
ALSO uses exception dictionary SPELL_EXC_DICT: if token is found, use value from there,
regardless of either engines' outputs.
NOTE: this differs from the humor and hunspell functions, which expect arrays of normal strings, not Unicode strings!
@@ -161,9 +161,6 @@
[list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms (Unicode strings), or [] if no suggestions available.
Returns [None, <error message>] if some critical error occured.
Returns [] if input was emtpy array or contained no meaningful tokens (e.g. only punctuation).
-
- token_is_correct: True if either of the 2 engines recognized the token.
- list_of_suggestions: union of the 2 engines' suggestions.
"""
ret = []
# convert input to normal strings (utf8 encoding)
@@ -174,25 +171,23 @@
ret = hunspout
for i, x in enumerate(hunspout):
# override with exception dictionary (if applicable)
- if len(hunspout) == len(inp): # for safety
+ if len(hunspout) == len(inp): # safety check
e = SPELL_EXC_DICT.get(inp[i])
if e != None:
ret[i] = e
continue
- # filter suggestions
+ # filter suggestions & convert them to unicode
if ret[i][0] == False:
tmp = []
for s in ret[i][1]:
- if ' ' in s:
+ t = unicode(s, 'utf8', 'replace')
+ sx = t.split(' ')
+ if len(sx) > 1 and len([x for x in sx if len(x) == 1]) > 0: # skip suggestion if contains space-delimited tokens and any token is only 1 char long (e.g. "u száj" for "uszáj")
continue
- tmp.append(s)
- if len(tmp) == TOPNSUGG:
+ tmp.append(t)
+ if len(tmp) == TOPNSUGG: # no more than TOPNSUGG
break
ret[i] = (False, tmp)
- # convert suggestions to Unicode strings
- for i in range(0, len(ret)):
- if ret[i][0] == False:
- ret[i] = (False, [unicode(x, 'utf8', 'replace') for x in ret[i][1]])
return ret
More information about the Hejes-devel
mailing list