[Hejes-devel] [1456] spell.py: complete overhaul, pyhunspell, akh11 and akh12 versions etc.

hejes-devel at nytud.hu hejes-devel at nytud.hu
Wed Sep 2 18:48:38 CEST 2015


Revision: 1456
Author:   mihaltz
Date:     2015-09-02 18:48:38 +0200 (Wed, 02 Sep 2015)
Log Message:
-----------
spell.py: complete overhaul, pyhunspell, akh11 and akh12 versions etc.

Modified Paths:
--------------
    trunk/web2py/applications/helyesiras_webdev/modules/spell.py

Added Paths:
-----------
    trunk/web2py/applications/helyesiras_webdev/modules/spell_deprecated_akh11.py
    trunk/web2py/applications/helyesiras_webdev/modules/spell_helpers.py

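A rough sketch of how the reworked module is meant to be driven after this commit, assuming the hunspell dictionaries and resource files are installed; the connection string, credentials and the sample token below are only illustrative (the exact return format is given by the new process() docstring in the diff that follows):

    import spell
    from gluon import *                      # provides DAL, as in spell.py's __main__

    db = DAL('mysql://user:password@localhost/dbdict')    # hypothetical credentials
    results = spell.process([u'kérdezzük'], db)
    # process() returns [None, <error message>] on critical errors and [] for empty input;
    # otherwise it yields one dict per input token:
    for res in results:
        print res['akh11_correct'], res['akh11_suggestions']
        print res['akh12_correct'], res['akh12_suggestions']
        print res['tips']                    # {field: value, ...} or {}
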
Modified: trunk/web2py/applications/helyesiras_webdev/modules/spell.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/spell.py	2015-09-02 16:02:59 UTC (rev 1455)
+++ trunk/web2py/applications/helyesiras_webdev/modules/spell.py	2015-09-02 16:48:38 UTC (rev 1456)
@@ -3,12 +3,15 @@
 
 """
 API for accessing humor-2005 and hunspell spellchecking and suggesting binaries
-+ news TODO writeme
+with lots of customizations (TODO doc writeme)
 
 @author: MM
 @requires: python 2.x
+
 """
+__author__ = 'mm'
 
+import hunspell
 import os
 import re
 import subprocess
@@ -20,14 +23,16 @@
 
 from MemcacheHelper import memcachememoize
 from egybekulon2_humor import StemmingAnalysis, HumorAna, Morph
+from spell_helpers import load_spell_exc_dict, init_pyhunspell
 
 USRINP_REGEXP = re.compile( unicode( r'^[0-9a-zA-ZáéíóöőúüűÁÉÍÓÖŐÚÜŰ \-\.\n\t%]*$', 'utf8' ) )
 """Regexp to check if user input contains only allowed characters. Use it on a Unicode object."""
 
-CMD_HUMOR_SPELLCHECKER = ['/home/projects/helyesiras/bin/humor2005_spell-suggest', '/home/projects/helyesiras/lib/humor2005', '1038', '65001']
-"""Command-line string for calling humor."""
-CMD_HUNSPELL = ["hunspell", "-d", "hu_HU",  "-p", os.path.dirname(os.path.realpath(__file__)) + '/resources/sajat_szotar', "-i", "utf-8"]
-"""Command-line string for calling hunspell"""
+# OBSOLETE:
+#CMD_HUMOR_SPELLCHECKER = ['/home/projects/helyesiras/bin/humor2005_spell-suggest', '/home/projects/helyesiras/lib/humor2005', '1038', '65001']
+#"""Command-line string for calling humor."""
+#CMD_HUNSPELL = ["hunspell", "-d", "hu_HU",  "-p", os.path.dirname(os.path.realpath(__file__)) + '/resources/sajat_szotar', "-i", "utf-8"]
+#"""Command-line string for calling hunspell"""
 
 TOPNSUGG = 5
 """Use only top n spell suggestions from the engine(s) for unknown tokens"""
@@ -44,7 +49,7 @@
 """Used by get_productive_tips()"""
 
 """Another exception dictionary, used before the engines: "word (utf8)" => "explaining text (utf8)"
-Explaining text may include markmin formatting codes (see http://www.web2py.com/init/static/markmin.html)
+Explaining text should include markmin formatting codes (see http://www.web2py.com/init/static/markmin.html)
 The special code `` ``:BR means line break.
 """
 HOMONYMS = {
@@ -55,116 +60,26 @@
 Felszólító módban, egyes szám, harmadik személyben két d-vel: „Ne ''kérdd'' az én siralmimnak okát” (Balassi) """
 }
 
-"""Exception dictionary, used before the engines
+"""Exception dictionaries to be used by hunspell, AkH11 and AkH12 versions
 File format: wordform TAB 1 (wf. is correct) or 0 (wf. is incorrect) TAB suggestion if incorrect (or empty if correct)
 """
-SPELL_EXC_DICT = {} # {wordform: (correct_or_not, [suggestion_if_notcorrect]), ... } -- keys and values are normal strings (utf8)
-try:
-  for line in open(os.path.dirname(os.path.realpath(__file__))+'/resources/spell_exceptions.tsv'):
-    line = line.strip()
-    if not line or line.startswith('#'):
-      continue
-    r = line.split('\t')
-    if len(r) not in [2, 3]:
-      sys.stderr.write('spell.py: error in exceptions file: {0}\n'.format(line))
-      continue
-    if r[1] == '1':
-      SPELL_EXC_DICT[r[0]] = (True, [])
-    elif r[1] == '0' and len(r) == 3:
-      SPELL_EXC_DICT[r[0]] = (False, [r[2]])
-except Exception as e:
-  sys.stderr.write('spell.py: exception while loading exceptions file:\n{0}\n'.format(str(e)))
-  SPELL_EXC_DICT = {}
-#print('Exception dictionary:')
-#print(str(SPELL_EXC_DICT))
+# {wordform: (correct_or_not, [suggestion_if_notcorrect]), ... } -- keys and values are normal strings (utf8)
+RESOURCES_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'resources') # absolute path to resources dir
+SPELL_EXC_DICT_AKH11 = load_spell_exc_dict(os.path.join(RESOURCES_PATH, 'spell_exceptions_akh11.tsv'))
+SPELL_EXC_DICT_AKH12 = load_spell_exc_dict(os.path.join(RESOURCES_PATH, 'spell_exceptions_akh12.tsv'))
 
+# Load pyhunspell with AkH 11 and 12 custom dictionaries:
+HUNSPELL_DICT_PATH = '/usr/share/hunspell'
+PYHUNSPELL_AKH11 = init_pyhunspell(os.path.join(HUNSPELL_DICT_PATH, 'hu_HU.dic'),
+                                   os.path.join(HUNSPELL_DICT_PATH, 'hu_HU.aff'),
+                                   os.path.join(RESOURCES_PATH, 'sajat_szotar_akh11')
+                                   )
+PYHUNSPELL_AKH12 = init_pyhunspell(os.path.join(HUNSPELL_DICT_PATH, 'hu_HU.dic'),
+                                   os.path.join(HUNSPELL_DICT_PATH, 'hu_HU.aff'),
+                                   os.path.join(RESOURCES_PATH, 'sajat_szotar_akh12')
+                                   )
 
-@memcachememoize
-def call_humor_spellchecker_cmdline_ntok(toks):
-  """Calls humor command-line spellchecker via a system call.
 
-     Parameter toks is an array containg the tokens to be analyzed (strings, utf-8 encoding).
-
-     Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
-     token_is_correct: True if the nth token was recognized, False otherwise
-     [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms, or [] if no suggestions available.
-
-     NOTE: comma characters in the input tokens are deleted before processing with the spellchecker.
-
-     Returns [None, <error message>] if some critical error occured.
-     Returns [] if input was emtpy array or contained no meaningful tokens (e.g. only punctuation).
-  """
-  toks = [x.replace(',', '') for x in toks] # kill commas in input
-  import shlex
-  try:
-    inp = '\n'.join(toks)
-    p = subprocess.Popen(CMD_HUMOR_SPELLCHECKER, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-    (stout, sterr) = p.communicate(inp)
-    if p.returncode != 0:
-      return [None, 'Command returned with exitcode {0}'.format(p.returncode)]
-    ret = []
-    for x in stout.split('\n'):
-      x = x.rstrip()
-      if not x:
-        continue
-      if not x.startswith('%'):
-        ret.append((True, []))
-      else:
-        if '\t' not in x:
-          ret.append((False, []))
-        else:
-          ret.append( (False, x.split('\t')[1].split(',')) )
-    return ret
-  except:
-    return [None, 'Exception: {0}'.format(sys.exc_info())]
-
-
-@memcachememoize
-def call_hunspell_cmdline_ntok(toks):
-  """Calls hunspell command-line spellchecker via a system call.
-
-     Parameter toks is an array containg the tokens to be analyzed (strings, utf-8 encoding).
-
-     Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
-     token_is_correct: True if the nth token was recognized, False otherwise
-     [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms, or [] if no suggestions available.
-
-     NOTE: comma characters in the input tokens are deleted before processing with the spellchecker.
-
-     Returns [None, <error message>] if some critical error occured.
-     Returns [] if input was emtpy array or contained no meaningful tokens (e.g. only punctuation).
-
-  """
-  toks = [x.replace(',', '') for x in toks] # kill commas in input
-  toks = [x.replace(':', ' ') for x in toks]
-  toks = [x.replace('_', ' ') for x in toks]
-  try:
-    inp = '\n'.join(toks)
-    p = subprocess.Popen(CMD_HUNSPELL, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-    (stout, sterr) = p.communicate(inp)
-    if p.returncode != 0:
-      return [None, 'Command returned with exitcode {0}'.format(p.returncode)]
-    ret = []
-    c = 0
-    for line in stout.split('\n'):
-      line = line.rstrip()
-      if c == 0: # skip first line: hunspell banner
-        c += 1
-        continue
-      if not line: # skip empty line
-        continue
-      if line[0] in '*+-': # token correct
-        ret.append( (True, []) )
-      elif line.endswith(' 0'): # token incorrect, no suggestions
-        ret.append( (False, []) )
-      else: # token incorrect, suggestions available
-        m = re.match("^[^:]*: (.*)$", line)
-        ret.append( (False, m.group(1).split(", ")) )
-    return ret
-  except:
-    return [None, 'Exception: {0}'.format(sys.exc_info())]
-
-
 def union_humor_hunspell(unicode_toks):
   """This is an experimental modification of union_humor_hunspell().
      Calls only hunspell. Suggestions are filtered: skip if contains space and any token is 1 char long, keep only up to TOPNSUGG suggestions.
@@ -206,61 +121,63 @@
   return ret
 
 
-def union_humor_hunspell_DEPRECATED(unicode_toks):
-  """DEPRECATED. See union_humor_hunspell().
-     Calls humor + hunspell on elements of unicode_toks, which is an array of Unicode strings (tokens).
-     ALSO uses exception dictionary SPELL_EXC_DICT: if token is found, use value from there,
-     regardless of either engines' outputs.
-     NOTE: this differs from the humor and hunspell functions, which expect arrays of normal strings, not Unicode strings!
-     Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
+def check_with_hunspell(hsobj, excdict, utoks):
+  """
+  :param hsobj: an initialized hunspell.HunSpell object (see PYHUNSPELL_AKH11|12)
+  :param excdict: loaded exception dictionary (see SPELL_EXC_DICT_AKH11|12)
+  :param utoks: list of tokens to analyze (unicode objects)
+  :return: [(token_is_correct, [list_of_suggestions]), ...]
      token_is_correct: True if the nth token was recognized, False otherwise
      [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms (Unicode strings), or [] if no suggestions available.
-     Returns [None, <error message>] if some critical error occured.
      Returns [] if input was an empty array or contained no meaningful tokens (e.g. only punctuation).
-
-     token_is_correct: True if either of the 2 engines recognized the token.
-     list_of_suggestions: union of the 2 engines' suggestions.
+  For each token:
+  0. preprocessing (kill commas etc.)
+  1. use hunspell (hsobj) to check if word form exists, get list of suggestions if not
+  2. use excdict to override false positives from hunspell
+  3. filter out nonsense suggestions, keep only the top TOPNSUGG suggestions
   """
   ret = []
+
   # convert input to normal strings (utf8 encoding)
-  inp = [x.encode('utf8') for x in unicode_toks]
-  # call the 2 engines
-  humorout = call_humor_spellchecker_cmdline_ntok(inp)
-  hunspout = call_hunspell_cmdline_ntok(inp)
-  if humorout[0] == None:
-    if hunspout[0] == None:
-      return [None, 'Double trouble: "{0}" + "{1}"'.format(humorout[1], hunspout[1])]
-    else:
-      return hunspout
-  if hunspout[0] == None:
-    return humorout # not None
-  if len(humorout) != len(hunspout): # neither is None: sanity check
-    return [None, 'Error: length of output from the 2 engines doesn''t match']
-  # merge them
-  ret = humorout
-  for i, x in enumerate(hunspout):
-    # --- hack: override with exception dictionary (if applicable)
-    if len(hunspout) == len(inp): # for safety
-      e = SPELL_EXC_DICT.get(inp[i])
-      if e != None:
-        ret[i] = e
-        continue
-    # --- hack end
-    if x[0] == True or ret[i][0] == True:
-      ret[i] = (True, [])
-    else:
-      # ret[i] = (False, list(set(x[1]+ret[i][1])))
-      # new merge method: add new hunspell suggestions to end of humor list
-      for s in x[1]:
-        if s not in ret[i][1]:
-          ret[i][1].append(s)
-  # convert suggestions to Unicode strings
-  for i in range(0, len(ret)):
-    if ret[i][0] == False:
-      ret[i] = (False, [unicode(x, 'utf8', 'replace') for x in ret[i][1]])
-  return ret
+  inp = [x.encode('utf8') for x in utoks]
+  # do some preprocessing
+  inp = [x.replace(',', '') for x in inp] # kill commas in input
+  inp = [x.replace(':', ' ') for x in inp]
+  inp = [x.replace('_', ' ') for x in inp]
 
+  #hunspout = call_hunspell_cmdline_ntok(inp)
+  # For each token:
+  for tok in inp:
 
+    # check in false positives dictionary first
+    exc = excdict.get(tok)
+    if exc is not None:
+      ret.append(exc) # overwrite with (known, suggs) from exception dict
+      continue # we're done with this token
+
+    # process with hunspell
+    known, suggs = False, []
+    known = hsobj.spell(tok)
+    if not known:
+      suggs = hsobj.suggest(tok)
+    ret.append( (known, suggs) ) # save
+
+    # filter suggestions
+    if not known:
+      tmp = []
+      for s in suggs:
+        sx = s.split(' ')
+        if len(sx) > 1 and len([x for x in sx if len(x) == 1]) > 0: # skip suggestion if contains space-delimited tokens and any token is only 1 char long (e.g. "u száj" for "uszáj")
+          continue
+        tmp.append(s)
+        if len(tmp) == TOPNSUGG: # no more than TOPNSUGG
+          break
+      ret[-1] = (False, tmp)
+
+  # convert back to unicode and return
+  return [(x[0], [unicode(y, 'utf8', 'replace') for y in x[1]]) for x in ret]
+
+
 def get_productive_tips(tok, hanas):
   """Returns unicode string with tips on correct language usage.
      Returns None if no tips can be associated.
@@ -360,43 +277,60 @@
 
 
 def process(utoks, db):
-  """New main entry point function for module.
+  """New main entry point function for module, starting from 2015 September (AkH 12!)
+     Uses 2 versions of hunspell tuned for AkH11 and AkH12.
      Param utoks: array of unicode strings, the input tokens
      Param db: a gluon.DAL object representing an open database connection
-     Returns an array containing 3-tuples for each token:
-     [(token_is_correct, suggestions, tips), ...]
+     Returns [{akh11_correct: ..., akh11_suggestions: ..., akh12_correct: ..., akh12_suggestions: ..., tips: ...}, ...] (one dictionary per input token)
      where:
-     token_is_correct: True if either of the 2 engines recognized the token
-     suggestions: list of unicode strings, union of the 2 engines' suggestions or [] if token_is_correct=True (or no suggestions available), see union_humor_hunspell()
-     tips is {field: value, ...} or {} (unicode strings)
+       akh11_correct: True iff hunspell (with the AkH11 custom dictionaries) recognized the token
+       akh11_suggestions: list of unicode string suggestions, or [] if akh11_correct=True (or no suggestions available)
+       akh12_correct: same as akh11_correct, but using the AkH12 setup
+       akh12_suggestions: same as akh11_suggestions, but using the AkH12 setup
+       tips: {field: value, ...} or {} (unicode strings)
      Returns [None, <error message>] if some critical error occurred.
      Returns [] if input was an empty array or contained no meaningful tokens (e.g. only punctuation).
   """
   ret = []
-  # call humor and hunspell suggestors
-  sugg = union_humor_hunspell(utoks)
-  if sugg[0] == None: # error?
-    return sugg
-  if len(sugg) != len(utoks): # sanity check
-    return [None, 'Error: len(sugg)!=len(utoks)']
-  if sugg == []: # no (meaningful) input
-    return []
-  for i, tok in enumerate(utoks):
-    # call humor for stemming and morph. analysis
-    tok8 = tok.encode('utf8').replace(',', '')
+
+  # For each token: add info from HOMONYMS, hunspell, dictionary tips, productive tips
+  for i, utok in enumerate(utoks):
+
+    # convert to utf8, kill commas
+    tok8 = utok.encode('utf8').replace(',', '')
+
     # check in HOMONYMS dict first
-    if tok8 in HOMONYMS:
-      ret.append( (True, [], {u'markmin': safe_unic(HOMONYMS[tok8])}) ) # 'markmin' key in tips: format in view with markmin
-      continue # no need for others
+    hom = HOMONYMS.get(tok8)
+    if hom is not None:
+      res = dict(akh11_correct=True, akh11_suggestions=[], akh12_correct=True, akh12_suggestions=[], tips={u'markmin': safe_unic(hom)})
+      ret.append(res)
+      continue # no need for other knowledge sources for this token
+
+    # call hunspell with AkH11 and AkH12 setups (returned: 1-element lists)
+    akh11 = check_with_hunspell(PYHUNSPELL_AKH11, SPELL_EXC_DICT_AKH11, [utok])
+    akh12 = check_with_hunspell(PYHUNSPELL_AKH12, SPELL_EXC_DICT_AKH12, [utok])
+
+    # call Humor for stemming and morph. analysis
     hanas = StemmingAnalysis(tok8).getAnas()
+
     # get dictionary-based tips
-    tips = get_dictionary_tips(tok8, hanas, db)
+    dtips = get_dictionary_tips(tok8, hanas, db)
+
     # get productive tips
     ptip = get_productive_tips(tok8, hanas)
-    if ptip != None:
-      tips['prod_tip'] = ptip
-    # merge with suggestions
-    ret.append( (sugg[i][0], sugg[i][1], tips) )
+
+    # save everything
+    res = {}
+    res['akh11_correct'] = akh11[0][0] if len(akh11) != 0 else False
+    res['akh11_suggestions'] = akh11[0][1] if len(akh11) != 0 else []
+    res['akh12_correct'] = akh12[0][0] if len(akh12) != 0 else False
+    res['akh12_suggestions'] = akh12[0][1] if len(akh12) != 0 else []
+    res['tips'] = dtips
+    if ptip is not None:
+      res['tips']['prod_tip'] = ptip
+    ret.append(res)
+
+  #return ret
   return ret
 
 
@@ -408,30 +342,6 @@
   else: return False
 
 
-def interactive_test(db):
-  """Interactive testing: user types a token to stdin, call analyzers and print results to stdout, repeat until blank line is entered.
-  """
-  print("Type words to analyze, blank line + <Enter> to exit")
-  while True:
-    inp = sys.stdin.readline().rstrip()
-    if not input_check(unicode(inp, 'utf8')):
-      print('A bemenet nem megengedett karakter(eke)t tartalmaz, kérjük, ellenőrizze! Csak a magyar ábécé kis- és nagybetűi, a számjegyek, a szóköz, a pont és a kötőjel, valamint a "%" karakter megengedettek.')
-      continue
-    if not inp: break
-    else: inp = inp.split(' ')
-    x = call_humor_spellchecker_cmdline_ntok(inp)
-    y = call_hunspell_cmdline_ntok(inp)
-    z = union_humor_hunspell([unicode(t, 'utf8') for t in inp])
-    p = process([unicode(t, 'utf8') for t in inp], db)
-    print('humor={0}\n'.format(x))
-    print('hunspell={0}\n'.format(y))
-    print('exc_dict={0}\n'.format([SPELL_EXC_DICT.get(x) for x in inp]))
-    print('unio={0}\n'.format(z))
-    print('process()={0}'.format(p))
-    print('')
-  return
-
-
 def safe_unic(s, enc='utf8'):
   """If s is a string, convert it to unicode using enc encoding.
      If s is already unicode, just return it.
@@ -446,6 +356,48 @@
     return unicode(str(s), enc, 'replace')
 
 
+def interactive_test(db):
+  """Interactive testing: user types a token to stdin, call analyzers and print results to stdout, repeat until blank line is entered.
+  """
+  import pprint
+  print("Type words to analyze, blank line + <Enter> to exit")
+  while True:
+
+    inp = sys.stdin.readline().rstrip()
+
+    if not input_check(unicode(inp, 'utf8')):
+      print('A bemenet nem megengedett karakter(eke)t tartalmaz, kérjük, ellenőrizze! Csak a magyar ábécé kis- és nagybetűi, a számjegyek, a szóköz, a pont és a kötőjel, valamint a "%" karakter megengedettek.')
+      continue
+
+    if not inp:
+      break
+
+    tinp = inp.split(' ')
+
+    print('')
+    p = process([unicode(t, 'utf8') for t in tinp], db)
+    print('process():')
+    pprint.pprint(p)
+    print('')
+
+    x1 = [SPELL_EXC_DICT_AKH11.get(x) for x in tinp]
+    print('SPELL_EXC_DICT_AKH11:')
+    pprint.pprint(x1)
+    print('')
+
+    x2 = [SPELL_EXC_DICT_AKH12.get(x) for x in tinp]
+    print('SPELL_EXC_DICT_AKH12:')
+    pprint.pprint(x2)
+    print('')
+
+    h = [HOMONYMS.get(x) for x in tinp]
+    print('HOMONYMS:')
+    pprint.pprint(h)
+    print('')
+
+  return
+
+
 if __name__ == '__main__':
 
   print 'Connecting to database...'

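The per-token logic of the new check_with_hunspell() above (exception dictionary first, then pyhunspell, then suggestion filtering) boils down to roughly the following standalone sketch; the dictionary paths and the sample exception entry are only illustrative stand-ins for PYHUNSPELL_AKH11|12 and SPELL_EXC_DICT_AKH11|12:

    import hunspell

    hs = hunspell.HunSpell('/usr/share/hunspell/hu_HU.dic', '/usr/share/hunspell/hu_HU.aff')
    exc_dict = {'muszály': (False, ['muszáj'])}   # false-positive overrides; loaded from a TSV in spell.py
    TOPNSUGG = 5

    def check_token(tok):                         # tok: utf-8 encoded str, commas already stripped
        if tok in exc_dict:                       # 1. the exception dictionary wins over the engine
            return exc_dict[tok]
        if hs.spell(tok):                         # 2. word form accepted by hunspell
            return (True, [])
        suggs = []
        for s in hs.suggest(tok):                 # 3. filter suggestions
            parts = s.split(' ')
            if len(parts) > 1 and any(len(p) == 1 for p in parts):
                continue                          # e.g. skip "u száj" offered for "uszáj"
            suggs.append(s)
            if len(suggs) == TOPNSUGG:            # keep at most TOPNSUGG suggestions
                break
        return (False, suggs)
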
Added: trunk/web2py/applications/helyesiras_webdev/modules/spell_deprecated_akh11.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/spell_deprecated_akh11.py	                        (rev 0)
+++ trunk/web2py/applications/helyesiras_webdev/modules/spell_deprecated_akh11.py	2015-09-02 16:48:38 UTC (rev 1456)
@@ -0,0 +1,455 @@
+#!/usr/bin/env python
+# coding: UTF-8
+
+"""
+API for accessing humor-2005 and hunspell spellchecking and suggesting binaries
++ news TODO writeme
+
+@author: MM
+@requires: python 2.x
+"""
+
+import os
+import re
+import subprocess
+import sys
+
+sys.path.append('/opt/web2py')
+from gluon import *
+from gluon.contrib.pymysql import escape_string
+
+from MemcacheHelper import memcachememoize
+from egybekulon2_humor import StemmingAnalysis, HumorAna, Morph
+
+USRINP_REGEXP = re.compile( unicode( r'^[0-9a-zA-ZáéíóöőúüűÁÉÍÓÖŐÚÜŰ \-\.\n\t%]*$', 'utf8' ) )
+"""Regexp to check if user input contains only allowed characters. Use it on a Unicode object."""
+
+CMD_HUMOR_SPELLCHECKER = ['/home/projects/helyesiras/bin/humor2005_spell-suggest', '/home/projects/helyesiras/lib/humor2005', '1038', '65001']
+"""Command-line string for calling humor."""
+CMD_HUNSPELL = ["hunspell", "-d", "hu_HU",  "-p", os.path.dirname(os.path.realpath(__file__)) + '/resources/sajat_szotar', "-i", "utf-8"]
+"""Command-line string for calling hunspell"""
+
+TOPNSUGG = 5
+"""Use only top n spell suggestions from the engine(s) for unknown tokens"""
+
+PRONOUNS = {
+'TPe3': u'ő',
+'TPt1': u'mi',
+'TPt2': u'ti',
+'TPt3': u'ők'
+}
+"""Used by get_productive_tips()"""
+
+VOWELS = [u'a', u'á', u'e', u'é', u'i', u'í', u'o', u'ó', u'ö', u'ő', u'u', u'ú', u'ü', u'ű']
+"""Used by get_productive_tips()"""
+
+"""Another exception dictionary, used before the engines: "word (utf8)" => "explaining text (utf8)"
+Explaining text may include markmin formatting codes (see http://www.web2py.com/init/static/markmin.html)
+The special code `` ``:BR means line break.
+"""
+HOMONYMS = {
+  "kérd": """1. ’''Kérd'' el a könyvet!’ (A ''kér'' ige egyes szám, második személyű, felszólító módú alakja.)
+
+2. ’Ő ''kérd'' valamit.’ (A kérd [’kérdez’] ige egyes szám, harmadik személyű, kijelentő módú, alanyi ragozású alakja.)`` ``:BR
+Ebben a formában ritkán használatos, inkább első és második személyben, pl. ’Miért ''kérded''?’`` ``:BR
+Felszólító módban, egyes szám, harmadik személyben két d-vel: „Ne ''kérdd'' az én siralmimnak okát” (Balassi) """
+}
+
+"""Exception dictionary, used before the engines
+File format: wordform TAB 1 (wf. is correct) or 0 (wf. is incorrect) TAB suggestion if incorrect (or empty if correct)
+"""
+SPELL_EXC_DICT = {} # {wordform: (correct_or_not, [suggestion_if_notcorrect]), ... } -- keys and values are normal strings (utf8)
+try:
+  for line in open(os.path.dirname(os.path.realpath(__file__))+'/resources/spell_exceptions.tsv'):
+    line = line.strip()
+    if not line or line.startswith('#'):
+      continue
+    r = line.split('\t')
+    if len(r) not in [2, 3]:
+      sys.stderr.write('spell.py: error in exceptions file: {0}\n'.format(line))
+      continue
+    if r[1] == '1':
+      SPELL_EXC_DICT[r[0]] = (True, [])
+    elif r[1] == '0' and len(r) == 3:
+      SPELL_EXC_DICT[r[0]] = (False, [r[2]])
+except Exception as e:
+  sys.stderr.write('spell.py: exception while loading exceptions file:\n{0}\n'.format(str(e)))
+  SPELL_EXC_DICT = {}
+#print('Exception dictionary:')
+#print(str(SPELL_EXC_DICT))
+
+
+@memcachememoize
+def call_humor_spellchecker_cmdline_ntok(toks):
+  """Calls humor command-line spellchecker via a system call.
+
+     Parameter toks is an array containing the tokens to be analyzed (strings, utf-8 encoding).
+
+     Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
+     token_is_correct: True if the nth token was recognized, False otherwise
+     [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms, or [] if no suggestions available.
+
+     NOTE: comma characters in the input tokens are deleted before processing with the spellchecker.
+
+     Returns [None, <error message>] if some critical error occurred.
+     Returns [] if input was an empty array or contained no meaningful tokens (e.g. only punctuation).
+  """
+  toks = [x.replace(',', '') for x in toks] # kill commas in input
+  import shlex
+  try:
+    inp = '\n'.join(toks)
+    p = subprocess.Popen(CMD_HUMOR_SPELLCHECKER, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    (stout, sterr) = p.communicate(inp)
+    if p.returncode != 0:
+      return [None, 'Command returned with exitcode {0}'.format(p.returncode)]
+    ret = []
+    for x in stout.split('\n'):
+      x = x.rstrip()
+      if not x:
+        continue
+      if not x.startswith('%'):
+        ret.append((True, []))
+      else:
+        if '\t' not in x:
+          ret.append((False, []))
+        else:
+          ret.append( (False, x.split('\t')[1].split(',')) )
+    return ret
+  except:
+    return [None, 'Exception: {0}'.format(sys.exc_info())]
+
+
+@memcachememoize
+def call_hunspell_cmdline_ntok(toks):
+  """Calls hunspell command-line spellchecker via a system call.
+
+     Parameter toks is an array containing the tokens to be analyzed (strings, utf-8 encoding).
+
+     Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
+     token_is_correct: True if the nth token was recognized, False otherwise
+     [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms, or [] if no suggestions available.
+
+     NOTE: comma characters in the input tokens are deleted before processing with the spellchecker.
+
+     Returns [None, <error message>] if some critical error occurred.
+     Returns [] if input was an empty array or contained no meaningful tokens (e.g. only punctuation).
+
+  """
+  toks = [x.replace(',', '') for x in toks] # kill commas in input
+  toks = [x.replace(':', ' ') for x in toks]
+  toks = [x.replace('_', ' ') for x in toks]
+  try:
+    inp = '\n'.join(toks)
+    p = subprocess.Popen(CMD_HUNSPELL, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    (stout, sterr) = p.communicate(inp)
+    if p.returncode != 0:
+      return [None, 'Command returned with exitcode {0}'.format(p.returncode)]
+    ret = []
+    c = 0
+    for line in stout.split('\n'):
+      line = line.rstrip()
+      if c == 0: # skip first line: hunspell banner
+        c += 1
+        continue
+      if not line: # skip empty line
+        continue
+      if line[0] in '*+-': # token correct
+        ret.append( (True, []) )
+      elif line.endswith(' 0'): # token incorrect, no suggestions
+        ret.append( (False, []) )
+      else: # token incorrect, suggestions available
+        m = re.match("^[^:]*: (.*)$", line)
+        ret.append( (False, m.group(1).split(", ")) )
+    return ret
+  except:
+    return [None, 'Exception: {0}'.format(sys.exc_info())]
+
+
+def union_humor_hunspell(unicode_toks):
+  """This is an experimental modification of union_humor_hunspell().
+     Calls only hunspell. Suggestions are filtered: skip if contains space and any token is 1 char long, keep only up to TOPNSUGG suggestions.
+     ALSO uses exception dictionary SPELL_EXC_DICT: if token is found, use value from there,
+     regardless of either engines' outputs.
+     NOTE: this differs from the humor and hunspell functions, which expect arrays of normal strings, not Unicode strings!
+     Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
+     token_is_correct: True if the nth token was recognized, False otherwise
+     [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms (Unicode strings), or [] if no suggestions available.
+     Returns [None, <error message>] if some critical error occurred.
+     Returns [] if input was an empty array or contained no meaningful tokens (e.g. only punctuation).
+  """
+  ret = []
+  # convert input to normal strings (utf8 encoding)
+  inp = [x.encode('utf8') for x in unicode_toks]
+  # call the engine
+  hunspout = call_hunspell_cmdline_ntok(inp)
+  # process
+  ret = hunspout
+  for i, x in enumerate(hunspout):
+    # override with exception dictionary (if applicable)
+    if len(hunspout) == len(inp): # safety check
+      e = SPELL_EXC_DICT.get(inp[i])
+      if e != None:
+        ret[i] = e
+        continue
+    # filter suggestions & convert them to unicode
+    if ret[i][0] == False:
+      tmp = []
+      for s in ret[i][1]:
+        t = unicode(s, 'utf8', 'replace')
+        sx = t.split(' ')
+        if len(sx) > 1 and len([x for x in sx if len(x) == 1]) > 0: # skip suggestion if contains space-delimited tokens and any token is only 1 char long (e.g. "u száj" for "uszáj")
+          continue
+        tmp.append(t)
+        if len(tmp) == TOPNSUGG: # no more than TOPNSUGG
+          break
+      ret[i] = (False, tmp)
+  return ret
+
+
+def union_humor_hunspell_DEPRECATED(unicode_toks):
+  """DEPRECATED. See union_humor_hunspell().
+     Calls humor + hunspell on elements of unicode_toks, which is an array of Unicode strings (tokens).
+     ALSO uses exception dictionary SPELL_EXC_DICT: if token is found, use value from there,
+     regardless of either engines' outputs.
+     NOTE: this differs from the humor and hunspell functions, which expect arrays of normal strings, not Unicode strings!
+     Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
+     token_is_correct: True if the nth token was recognized, False otherwise
+     [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms (Unicode strings), or [] if no suggestions available.
+     Returns [None, <error message>] if some critical error occurred.
+     Returns [] if input was an empty array or contained no meaningful tokens (e.g. only punctuation).
+
+     token_is_correct: True if either of the 2 engines recognized the token.
+     list_of_suggestions: union of the 2 engines' suggestions.
+  """
+  ret = []
+  # convert input to normal strings (utf8 encoding)
+  inp = [x.encode('utf8') for x in unicode_toks]
+  # call the 2 engines
+  humorout = call_humor_spellchecker_cmdline_ntok(inp)
+  hunspout = call_hunspell_cmdline_ntok(inp)
+  if humorout[0] == None:
+    if hunspout[0] == None:
+      return [None, 'Double trouble: "{0}" + "{1}"'.format(humorout[1], hunspout[1])]
+    else:
+      return hunspout
+  if hunspout[0] == None:
+    return humorout # not None
+  if len(humorout) != len(hunspout): # neither is None: sanity check
+    return [None, "Error: length of output from the 2 engines doesn't match"]
+  # merge them
+  ret = humorout
+  for i, x in enumerate(hunspout):
+    # --- hack: override with exception dictionary (if applicable)
+    if len(hunspout) == len(inp): # for safety
+      e = SPELL_EXC_DICT.get(inp[i])
+      if e != None:
+        ret[i] = e
+        continue
+    # --- hack end
+    if x[0] == True or ret[i][0] == True:
+      ret[i] = (True, [])
+    else:
+      # ret[i] = (False, list(set(x[1]+ret[i][1])))
+      # new merge method: add new hunspell suggestions to end of humor list
+      for s in x[1]:
+        if s not in ret[i][1]:
+          ret[i][1].append(s)
+  # convert suggestions to Unicode strings
+  for i in range(0, len(ret)):
+    if ret[i][0] == False:
+      ret[i] = (False, [unicode(x, 'utf8', 'replace') for x in ret[i][1]])
+  return ret
+
+
+def get_productive_tips(tok, hanas):
+  """Returns unicode string with tips on correct language usage.
+     Returns None if no tips can be associated.
+     tok: the input token (utf8 string)
+     hanas: an array of HumorAna objects representing analyses of tok (utf8 string fields!)
+  """
+  if hanas == []: # no analyses: return None
+    return None
+  for ana in hanas:
+    if ana.pos == "IGE":
+      if re.match(".*[bcfghjklmnprstvz]d$", tok): # msh. + -d(d) vegu igek
+        for morph in ana.morphs:
+      # kezd ~ kezdd, mond ~ mondd
+          if morph.tag == "e3":
+            tip = u"Ő " + unicode(tok, 'utf8') + u" [valamit]. (Harmadik személy, kijelentő mód. Felszólító módban, egyes szám, második személyben két d-vel.)"
+            return tip
+      # nakolas
+      for morph in ana.morphs:
+        if "TFt3" in morph.tag:
+          tip = u"Ők " + unicode(tok, 'utf8') + u" azt."
+          return tip
+      # suksuk
+      if re.match(".*(szt|[^s]t)$", ana.stem): # -t es -szt vegu igek
+        for morph in ana.morphs:
+          if re.match("TP(e3|t[123])", morph.tag):
+            pronoun = PRONOUNS[morph.tag]
+            tip = u"Azt akarják, hogy ne " + unicode(tok, 'utf8') + u" tovább."
+            return tip
+      elif re.match(".*st$", ana.stem): # -st vegu igek
+        for morph in ana.morphs:
+          if "TPt1" in morph.tag:
+            pronoun = PRONOUNS[morph.tag]
+            tip = u"Azt akarják, hogy ne " + unicode(tok, 'utf8') + u" tovább."
+            return tip
+    elif ana.pos == "FN":
+      # det = u"az" if tok[0].lower() in VOWELS else u"a"
+      # -ba/-be; -ban/-ben
+      for morph in ana.morphs: # -bV
+        if "ILL" in morph.tag:
+          #tip = u"Hová? kérdésre válaszolva, pl. elmentem " + det + u" " + unicode(tok, 'utf8') + u"."
+          tip = u"Általában a Hová? kérdésre válaszolva."
+          return tip
+        elif "INE" in morph.tag: # -bVn
+          #tip = u"Hol? kérdésre válaszolva, pl. " + det + u" " + unicode(tok, 'utf8') + u" vagyok."
+          tip = u"Általában a Hol? kérdésre válaszolva."
+          return tip
+  return None
+
+
+def get_dictionary_tips(tok, hanas, db):
+  """Returns dictionary with keys as tip field names and values as tip field values
+     (keys and values are unicode strings) associated with input token tok.
+     Returns empty dict if no data could be found.
+     tok: an input token (utf8 string)
+     hanas: an array of HumorAna objects representing analyses of the input token
+     db: a gluon.DAL object representing an open database connection to dbdict DB
+  """
+  ret = {}
+  # assemble set of strings to query: 
+  # 1. the surface form itself! e.g. gója => gója (surf), gó (*stem) ==> gólya :)
+  # 2. stems from humor
+  stems = set([tok])
+  for ana in hanas:
+    if ana.stem != '':
+      stems.add(ana.stem)
+  # 1. query paronyms with the stem(s)
+  for stem in stems:
+    sql = """SELECT * FROM sugg_types t
+             JOIN sugg_types_paronyms p ON t.id=p.id
+             WHERE t.actual={0}
+          """.format(escape_string(stem))
+    try:
+      res = db.executesql(sql, as_dict=True)
+    except: # any exception: return as unknown
+      return {}
+    # process
+    if len(res) > 0:
+      for k, v in res[0].items(): # TODO: what if len(res) > 1
+        if v != None:
+          ret[safe_unic(k)] = safe_unic(v)
+      break # TODO: what if >1 stem matches
+  # 2. query slangs with the surface form only
+  sql = """SELECT * FROM sugg_types t
+           JOIN sugg_types_slang s ON t.id=s.id
+           WHERE t.actual={0}
+        """.format(escape_string(tok))
+  try:
+    res = db.executesql(sql, as_dict=True)
+  except: # any exception: return as unknown
+    return {}
+  # process
+  if len(res) > 0:
+    for k, v in res[0].items(): # TODO: what if len(res) > 1
+      if v != None:
+        ret[safe_unic(k)] = safe_unic(v)
+  return ret
+
+
+def process(utoks, db):
+  """New main entry point function for module.
+     Param utoks: array of unicode strings, the input tokens
+     Param db: a gluon.DAL object representing an open database connection
+     Returns an array containing 3-tuples for each token:
+     [(token_is_correct, suggestions, tips), ...]
+     where:
+     token_is_correct: True if either of the 2 engines recognized the token
+     suggestions: list of unicode strings, union of the 2 engines' suggestions or [] if token_is_correct=True (or no suggestions available), see union_humor_hunspell()
+     tips is {field: value, ...} or {} (unicode strings)
+     Returns [None, <error message>] if some critical error occurred.
+     Returns [] if input was an empty array or contained no meaningful tokens (e.g. only punctuation).
+  """
+  ret = []
+  # call humor and hunspell suggestors
+  sugg = union_humor_hunspell(utoks)
+  if sugg[0] == None: # error?
+    return sugg
+  if len(sugg) != len(utoks): # sanity check
+    return [None, 'Error: len(sugg)!=len(utoks)']
+  if sugg == []: # no (meaningful) input
+    return []
+  for i, tok in enumerate(utoks):
+    # call humor for stemming and morph. analysis
+    tok8 = tok.encode('utf8').replace(',', '')
+    # check in HOMONYMS dict first
+    if tok8 in HOMONYMS:
+      ret.append( (True, [], {u'markmin': safe_unic(HOMONYMS[tok8])}) ) # 'markmin' key in tips: format in view with markmin
+      continue # no need for others
+    hanas = StemmingAnalysis(tok8).getAnas()
+    # get dictionary-based tips
+    tips = get_dictionary_tips(tok8, hanas, db)
+    # get productive tips
+    ptip = get_productive_tips(tok8, hanas)
+    if ptip != None:
+      tips['prod_tip'] = ptip
+    # merge with suggestions
+    ret.append( (sugg[i][0], sugg[i][1], tips) )
+  return ret
+
+
+def input_check(inp):
+  """Return True iff inp conforms to USRINP_REGEXP.
+     Note: inp should be unicode string!!!
+  """
+  if USRINP_REGEXP.match(inp): return True
+  else: return False
+
+
+def interactive_test(db):
+  """Interactive testing: user types a token to stdin, call analyzers and print results to stdout, repeat until blank line is entered.
+  """
+  print("Type words to analyze, blank line + <Enter> to exit")
+  while True:
+    inp = sys.stdin.readline().rstrip()
+    if not input_check(unicode(inp, 'utf8')):
+      print('A bemenet nem megengedett karakter(eke)t tartalmaz, kérjük, ellenőrizze! Csak a magyar ábécé kis- és nagybetűi, a számjegyek, a szóköz, a pont és a kötőjel, valamint a "%" karakter megengedettek.')
+      continue
+    if not inp: break
+    else: inp = inp.split(' ')
+    x = call_humor_spellchecker_cmdline_ntok(inp)
+    y = call_hunspell_cmdline_ntok(inp)
+    z = union_humor_hunspell([unicode(t, 'utf8') for t in inp])
+    p = process([unicode(t, 'utf8') for t in inp], db)
+    print('humor={0}\n'.format(x))
+    print('hunspell={0}\n'.format(y))
+    print('exc_dict={0}\n'.format([SPELL_EXC_DICT.get(x) for x in inp]))
+    print('unio={0}\n'.format(z))
+    print('process()={0}'.format(p))
+    print('')
+  return
+
+
+def safe_unic(s, enc='utf8'):
+  """If s is a string, convert it to unicode using enc encoding.
+     If s is already unicode, just return it.
+     This is a workaround: different versions of web2py return string/unicode in DB query.
+     Also, different column values are returned as either str or unicode depending on collation.
+  """
+  if isinstance(s, str):
+    return unicode(s, enc, 'replace')
+  elif isinstance(s, unicode):
+    return s
+  else: # ???
+    return unicode(str(s), enc, 'replace')
+
+
+if __name__ == '__main__':
+
+  print 'Connecting to database...'
+  db = DAL('mysql://dbdicter:dbdicter123@localhost/dbdict')
+  print 'Done'
+
+  interactive_test(db)

Added: trunk/web2py/applications/helyesiras_webdev/modules/spell_helpers.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/spell_helpers.py	                        (rev 0)
+++ trunk/web2py/applications/helyesiras_webdev/modules/spell_helpers.py	2015-09-02 16:48:38 UTC (rev 1456)
@@ -0,0 +1,53 @@
+"""
+Helper functions for spell.py
+"""
+__author__ = 'mm'
+
+import hunspell
+import sys
+
+
+def load_spell_exc_dict(filename):
+  """
+  :param filename: name of tsv file with false positives
+  :return: dict (see SPELL_EXC_DICT_AKH11|12), may be empty
+  """
+  ret = {}
+  try:
+    for line in open(filename):
+      line = line.strip()
+      if not line or line.startswith('#'):
+        continue
+      r = line.split('\t')
+      if len(r) not in [2, 3]:
+        sys.stderr.write('spell.py: error in exceptions file: {0}\n'.format(line))
+        continue
+      if r[1] == '1':
+        ret[r[0]] = (True, [])
+      elif r[1] == '0' and len(r) == 3:
+        ret[r[0]] = (False, [r[2]])
+  except Exception as e:
+    sys.stderr.write('spell.py: exception while loading exceptions file {0}:\n{1}\n'.format(filename, str(e)))
+    return {}
+  return ret
+
+
+def init_pyhunspell(dicfile, afffile, customdictfile):
+  """
+  Initialize a hunspell.HunSpell object with the given .dic and .aff files and load custom dictionary entries.
+  :param dicfile: name of .dic file
+  :param afffile: name of .aff file
+  :param customdictfile: name of custom dictionary file (lines: "unknown_word(\/known_analogous_word)?"), UTF-8 encoding
+  :return: an initialized hunspell.HunSpell object
+  """
+  hs = hunspell.HunSpell(dicfile, afffile)
+  with open(customdictfile) as cdf:
+    for line in cdf:
+      fields = line.strip().split('/')
+      if len(fields) == 1:
+        hs.add(fields[0])
+      elif len(fields) == 2:
+        hs.add_with_affix(fields[0], fields[1])
+      else:
+        sys.stderr.write('Error in file {0}: "{1}"'.format(customdictfile, line))
+  return hs
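
A short usage sketch for the two helpers added above; the file paths and sample words are only illustrative. The custom dictionary file is expected to contain one entry per line, either "word" or "word/analogous_known_word" (the latter is passed to add_with_affix()):

    from spell_helpers import load_spell_exc_dict, init_pyhunspell

    exc = load_spell_exc_dict('resources/spell_exceptions_akh12.tsv')
    hs = init_pyhunspell('/usr/share/hunspell/hu_HU.dic',
                         '/usr/share/hunspell/hu_HU.aff',
                         'resources/sajat_szotar_akh12')

    print hs.spell('asztal')     # True if the word form is accepted
    print hs.suggest('asztla')   # list of suggested corrections
    print exc.get('muszály')     # (False, ['muszáj']) if the form is listed in the TSV, else None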



