[Hejes-devel] [779] spell.py: hunspell only; skip suggestions that contain spaces and have a token that is only 1 char long.

Wed May 29 17:59:00 CEST 2013

Revision: 779
Author:   mihaltz
Date:     2013-05-29 17:59:00 +0200 (Wed, 29 May 2013)
Log Message:
-----------
spell.py: hunspell only; skip suggestions that contain spaces and have a token that is only 1 char long.

Modified Paths:
--------------
    trunk/web2py/applications/helyesiras_webdev/modules/spell.py

Modified: trunk/web2py/applications/helyesiras_webdev/modules/spell.py
===================================================================

--- trunk/web2py/applications/helyesiras_webdev/modules/spell.py	2013-05-29 15:18:43 UTC (rev 778)
+++ trunk/web2py/applications/helyesiras_webdev/modules/spell.py	2013-05-29 15:59:00 UTC (rev 779)
@@ -152,7 +152,7 @@
 
 def union_humor_hunspell(unicode_toks):
   """This is an experimental modification of union_humor_hunspell().
-     Calls only hunspell. Suggestions are filtered: skip if contains space, only top TOPNSUGG
+     Calls only hunspell. Suggestions are filtered: skip if contains space and any token is 1 char long, keep only up to TOPNSUGG suggestions.
      ALSO uses exception dictionary SPELL_EXC_DICT: if token is found, use value from there,
      regardless of either engines' outputs.
      NOTE: this differs from the humor and hunspell functions, which expect arrays of normal strings, not Unicode strings!
@@ -161,9 +161,6 @@
      [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms (Unicode strings), or [] if no suggestions available.
      Returns [None, <error message>] if some critical error occured.
      Returns [] if input was emtpy array or contained no meaningful tokens (e.g. only punctuation).
-
-     token_is_correct: True if either of the 2 engines recognized the token.
-     list_of_suggestions: union of the 2 engines' suggestions.
   """
   ret = []
   # convert input to normal strings (utf8 encoding)
@@ -174,25 +171,23 @@
   ret = hunspout
   for i, x in enumerate(hunspout):
     # override with exception dictionary (if applicable)
-    if len(hunspout) == len(inp): # for safety
+    if len(hunspout) == len(inp): # safety check
       e = SPELL_EXC_DICT.get(inp[i])
       if e != None:
         ret[i] = e
         continue
-    # filter suggestions
+    # filter suggestions & convert them to unicode
     if ret[i][0] == False:
       tmp = []
       for s in ret[i][1]:
-        if ' ' in s:
+        t = unicode(s, 'utf8', 'replace')
+        sx = t.split(' ')
+        if len(sx) > 1 and len([x for x in sx if len(x) == 1]) > 0: # skip suggestion if contains space-delimited tokens and any token is only 1 char long (e.g. "u száj" for "uszáj")
           continue
-        tmp.append(s)
-        if len(tmp) == TOPNSUGG:
+        tmp.append(t)
+        if len(tmp) == TOPNSUGG: # no more than TOPNSUGG
           break
       ret[i] = (False, tmp)
-  # convert suggestions to Unicode strings
-  for i in range(0, len(ret)):
-    if ret[i][0] == False:
-      ret[i] = (False, [unicode(x, 'utf8', 'replace') for x in ret[i][1]])
   return ret