[Hejes-devel] [770] spell.py: use only hunspell, top 5 suggestions, no suggestions with spaces

Thu May 23 17:01:06 CEST 2013

Revision: 770
Author:   mihaltz
Date:     2013-05-23 17:01:06 +0200 (Thu, 23 May 2013)
Log Message:
-----------
spell.py: use only hunspell, top 5 suggestions, no suggestions with spaces

Modified Paths:
--------------
    trunk/web2py/applications/helyesiras_webdev/modules/spell.py

Modified: trunk/web2py/applications/helyesiras_webdev/modules/spell.py
===================================================================

--- trunk/web2py/applications/helyesiras_webdev/modules/spell.py	2013-05-22 14:24:29 UTC (rev 769)
+++ trunk/web2py/applications/helyesiras_webdev/modules/spell.py	2013-05-23 15:01:06 UTC (rev 770)
@@ -29,6 +29,9 @@
 CMD_HUNSPELL = ["hunspell", "-d", "hu_HU", "-i", "utf-8"]
 """Command-line string for calling hunspell"""
 
+TOPNSUGG = 5
+"""Use only top n spell suggestions from the engine(s) for unknown tokens"""
+
 PRONOUNS = {
 'TPe3': u'ő',
 'TPt1': u'mi',
@@ -148,7 +151,8 @@
 
 
 def union_humor_hunspell(unicode_toks):
-  """Calls humor + hunspell on elements of unicode_toks, which is an array of Unicode strings (tokens).
+  """Experimental replacement for the original humor + hunspell implementation, which is kept as union_humor_hunspell_DEPRECATED().
+     Calls only hunspell. Suggestions are filtered: skip if contains space, only top TOPNSUGG
      ALSO uses exception dictionary SPELL_EXC_DICT: if token is found, use value from there,
      regardless of either engines' outputs.
      NOTE: this differs from the humor and hunspell functions, which expect arrays of normal strings, not Unicode strings!
@@ -164,6 +168,52 @@
   ret = []
   # convert input to normal strings (utf8 encoding)
   inp = [x.encode('utf8') for x in unicode_toks]
+  # call the engine
+  hunspout = call_hunspell_cmdline_ntok(inp)
+  # process
+  ret = hunspout
+  for i, x in enumerate(hunspout):
+    # override with exception dictionary (if applicable)
+    if len(hunspout) == len(inp): # for safety
+      e = SPELL_EXC_DICT.get(inp[i])
+      if e != None:
+        ret[i] = e
+        continue
+    # filter suggestions
+    if ret[i][0] == False:
+      tmp = []
+      for s in ret[i][1]:
+        if ' ' in s:
+          continue
+        tmp.append(s)
+        if len(tmp) == TOPNSUGG:
+          break
+      ret[i] = (False, tmp)
+  # convert suggestions to Unicode strings
+  for i in range(0, len(ret)):
+    if ret[i][0] == False:
+      ret[i] = (False, [unicode(x, 'utf8', 'replace') for x in ret[i][1]])
+  return ret
+
+
+def union_humor_hunspell_DEPRECATED(unicode_toks):
+  """DEPRECATED. See union_humor_hunspell().
+     Calls humor + hunspell on elements of unicode_toks, which is an array of Unicode strings (tokens).
+     ALSO uses exception dictionary SPELL_EXC_DICT: if token is found, use value from there,
+     regardless of either engines' outputs.
+     NOTE: this differs from the humor and hunspell functions, which expect arrays of normal strings, not Unicode strings!
+     Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
+     token_is_correct: True if the nth token was recognized, False otherwise
+     [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms (Unicode strings), or [] if no suggestions available.
+     Returns [None, <error message>] if some critical error occurred.
+     Returns [] if input was empty array or contained no meaningful tokens (e.g. only punctuation).
+
+     token_is_correct: True if either of the 2 engines recognized the token.
+     list_of_suggestions: union of the 2 engines' suggestions.
+  """
+  ret = []
+  # convert input to normal strings (utf8 encoding)
+  inp = [x.encode('utf8') for x in unicode_toks]
   # call the 2 engines
   humorout = call_humor_spellchecker_cmdline_ntok(inp)
   hunspout = call_hunspell_cmdline_ntok(inp)