[Hejes-devel] [1456] spell.py: complete overhaul, pyhunspell, akh11 and akh12 versions etc.
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Wed Sep 2 18:48:38 CEST 2015
Revision: 1456
Author: mihaltz
Date: 2015-09-02 18:48:38 +0200 (Wed, 02 Sep 2015)
Log Message:
-----------
spell.py: complete overhaul, pyhunspell, akh11 and akh12 versions etc.
Modified Paths:
--------------
trunk/web2py/applications/helyesiras_webdev/modules/spell.py
Added Paths:
-----------
trunk/web2py/applications/helyesiras_webdev/modules/spell_deprecated_akh11.py
trunk/web2py/applications/helyesiras_webdev/modules/spell_helpers.py
Modified: trunk/web2py/applications/helyesiras_webdev/modules/spell.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/spell.py 2015-09-02 16:02:59 UTC (rev 1455)
+++ trunk/web2py/applications/helyesiras_webdev/modules/spell.py 2015-09-02 16:48:38 UTC (rev 1456)
@@ -3,12 +3,15 @@
"""
API for accessing humor-2005 and hunspell spellchecking and suggesting binaries
-+ news TODO writeme
+with lots of customizations (TODO doc writeme)
@author: MM
@requires: python 2.x
+
"""
+__author__ = 'mm'
+import hunspell
import os
import re
import subprocess
@@ -20,14 +23,16 @@
from MemcacheHelper import memcachememoize
from egybekulon2_humor import StemmingAnalysis, HumorAna, Morph
+from spell_helpers import load_spell_exc_dict, init_pyhunspell
USRINP_REGEXP = re.compile( unicode( r'^[0-9a-zA-ZáéíóöőúüűÁÉÍÓÖŐÚÜŰ \-\.\n\t%]*$', 'utf8' ) )
"""Regexp to check if user input contains only allowed characters. Use it on a Unicode object."""
-CMD_HUMOR_SPELLCHECKER = ['/home/projects/helyesiras/bin/humor2005_spell-suggest', '/home/projects/helyesiras/lib/humor2005', '1038', '65001']
-"""Command-line string for calling humor."""
-CMD_HUNSPELL = ["hunspell", "-d", "hu_HU", "-p", os.path.dirname(os.path.realpath(__file__)) + '/resources/sajat_szotar', "-i", "utf-8"]
-"""Command-line string for calling hunspell"""
+# OBSOLETE:
+#CMD_HUMOR_SPELLCHECKER = ['/home/projects/helyesiras/bin/humor2005_spell-suggest', '/home/projects/helyesiras/lib/humor2005', '1038', '65001']
+#"""Command-line string for calling humor."""
+#CMD_HUNSPELL = ["hunspell", "-d", "hu_HU", "-p", os.path.dirname(os.path.realpath(__file__)) + '/resources/sajat_szotar', "-i", "utf-8"]
+#"""Command-line string for calling hunspell"""
TOPNSUGG = 5
"""Use only top n spell suggestions from the engine(s) for unknown tokens"""
@@ -44,7 +49,7 @@
"""Used by get_productive_tips()"""
"""Another exception dictionary, used before the engines: "word (utf8)" => "explaining text (utf8)"
-Explaining text may include markmin formatting codes (see http://www.web2py.com/init/static/markmin.html)
+Explaining text should include markmin formatting codes (see http://www.web2py.com/init/static/markmin.html)
The special code `` ``:BR means line break.
"""
HOMONYMS = {
@@ -55,116 +60,26 @@
Felszólító módban, egyes szám, harmadik személyben két d-vel: „Ne ''kérdd'' az én siralmimnak okát” (Balassi) """
}
-"""Exception dictionary, used before the engines
+"""Exception dictionaries to be used by hunspell, AkH11 and AkH12 versions
File format: wordform TAB 1 (wf. is correct) or 0 (wf. is incorrect) TAB suggestion if incorrect (or empty if correct)
"""
-SPELL_EXC_DICT = {} # {wordform: (correct_or_not, [suggestion_if_notcorrect]), ... } -- keys and values are normal strings (utf8)
-try:
- for line in open(os.path.dirname(os.path.realpath(__file__))+'/resources/spell_exceptions.tsv'):
- line = line.strip()
- if not line or line.startswith('#'):
- continue
- r = line.split('\t')
- if len(r) not in [2, 3]:
- sys.stderr.write('spell.py: error in exceptions file: {0}\n'.format(line))
- continue
- if r[1] == '1':
- SPELL_EXC_DICT[r[0]] = (True, [])
- elif r[1] == '0' and len(r) == 3:
- SPELL_EXC_DICT[r[0]] = (False, [r[2]])
-except Exception as e:
- sys.stderr.write('spell.py: exception while loading exceptions file:\n{0}\n'.format(str(e)))
- SPELL_EXC_DICT = {}
-#print('Exception dictionary:')
-#print(str(SPELL_EXC_DICT))
+# {wordform: (correct_or_not, [suggestion_if_notcorrect]), ... } -- keys and values are normal strings (utf8)
+RESOURCES_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'resources') # absolute path to resources dir
+SPELL_EXC_DICT_AKH11 = load_spell_exc_dict(os.path.join(RESOURCES_PATH, 'spell_exceptions_akh11.tsv'))
+SPELL_EXC_DICT_AKH12 = load_spell_exc_dict(os.path.join(RESOURCES_PATH, 'spell_exceptions_akh12.tsv'))
+# Load pyhunspell with AkH 11 and 12 custom dictionaries:
+HUNSPELL_DICT_PATH = '/usr/share/hunspell'
+PYHUNSPELL_AKH11 = init_pyhunspell(os.path.join(HUNSPELL_DICT_PATH, 'hu_HU.dic'),
+ os.path.join(HUNSPELL_DICT_PATH, 'hu_HU.aff'),
+ os.path.join(RESOURCES_PATH, 'sajat_szotar_akh11')
+ )
+PYHUNSPELL_AKH12 = init_pyhunspell(os.path.join(HUNSPELL_DICT_PATH, 'hu_HU.dic'),
+ os.path.join(HUNSPELL_DICT_PATH, 'hu_HU.aff'),
+ os.path.join(RESOURCES_PATH, 'sajat_szotar_akh12')
+ )
-@memcachememoize
-def call_humor_spellchecker_cmdline_ntok(toks):
- """Calls humor command-line spellchecker via a system call.
- Parameter toks is an array containg the tokens to be analyzed (strings, utf-8 encoding).
-
- Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
- token_is_correct: True if the nth token was recognized, False otherwise
- [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms, or [] if no suggestions available.
-
- NOTE: comma characters in the input tokens are deleted before processing with the spellchecker.
-
- Returns [None, <error message>] if some critical error occured.
- Returns [] if input was emtpy array or contained no meaningful tokens (e.g. only punctuation).
- """
- toks = [x.replace(',', '') for x in toks] # kill commas in input
- import shlex
- try:
- inp = '\n'.join(toks)
- p = subprocess.Popen(CMD_HUMOR_SPELLCHECKER, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
- (stout, sterr) = p.communicate(inp)
- if p.returncode != 0:
- return [None, 'Command returned with exitcode {0}'.format(p.returncode)]
- ret = []
- for x in stout.split('\n'):
- x = x.rstrip()
- if not x:
- continue
- if not x.startswith('%'):
- ret.append((True, []))
- else:
- if '\t' not in x:
- ret.append((False, []))
- else:
- ret.append( (False, x.split('\t')[1].split(',')) )
- return ret
- except:
- return [None, 'Exception: {0}'.format(sys.exc_info())]
-
-
-@memcachememoize
-def call_hunspell_cmdline_ntok(toks):
- """Calls hunspell command-line spellchecker via a system call.
-
- Parameter toks is an array containg the tokens to be analyzed (strings, utf-8 encoding).
-
- Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
- token_is_correct: True if the nth token was recognized, False otherwise
- [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms, or [] if no suggestions available.
-
- NOTE: comma characters in the input tokens are deleted before processing with the spellchecker.
-
- Returns [None, <error message>] if some critical error occured.
- Returns [] if input was emtpy array or contained no meaningful tokens (e.g. only punctuation).
-
- """
- toks = [x.replace(',', '') for x in toks] # kill commas in input
- toks = [x.replace(':', ' ') for x in toks]
- toks = [x.replace('_', ' ') for x in toks]
- try:
- inp = '\n'.join(toks)
- p = subprocess.Popen(CMD_HUNSPELL, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
- (stout, sterr) = p.communicate(inp)
- if p.returncode != 0:
- return [None, 'Command returned with exitcode {0}'.format(p.returncode)]
- ret = []
- c = 0
- for line in stout.split('\n'):
- line = line.rstrip()
- if c == 0: # skip first line: hunspell banner
- c += 1
- continue
- if not line: # skip empty line
- continue
- if line[0] in '*+-': # token correct
- ret.append( (True, []) )
- elif line.endswith(' 0'): # token incorrect, no suggestions
- ret.append( (False, []) )
- else: # token incorrect, suggestions available
- m = re.match("^[^:]*: (.*)$", line)
- ret.append( (False, m.group(1).split(", ")) )
- return ret
- except:
- return [None, 'Exception: {0}'.format(sys.exc_info())]
-
-
def union_humor_hunspell(unicode_toks):
"""This is an experimental modification of union_humor_hunspell().
Calls only hunspell. Suggestions are filtered: skip if contains space and any token is 1 char long, keep only up to TOPNSUGG suggestions.
@@ -206,61 +121,63 @@
return ret
-def union_humor_hunspell_DEPRECATED(unicode_toks):
- """DEPRECATED. See union_humor_hunspell().
- Calls humor + hunspell on elements of unicode_toks, which is an array of Unicode strings (tokens).
- ALSO uses exception dictionary SPELL_EXC_DICT: if token is found, use value from there,
- regardless of either engines' outputs.
- NOTE: this differs from the humor and hunspell functions, which expect arrays of normal strings, not Unicode strings!
- Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
+def check_with_hunspell(hsobj, excdict, utoks):
+ """
+ :param hsobj: an initialized hunspell.HunSpell object (see PYHUNSPELL_AKH11|12)
+ :param excdict: loaded exception dictionary (see SPELL_EXC_DICT_AKH11|12)
+ :param utoks: list of tokens to analyze (unicode objects)
+ :return: [(token_is_correct, [list_of_suggestions]), ...]
token_is_correct: True if the nth token was recognized, False otherwise
[list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms (Unicode strings), or [] if no suggestions available.
- Returns [None, <error message>] if some critical error occured.
Returns [] if input was emtpy array or contained no meaningful tokens (e.g. only punctuation).
-
- token_is_correct: True if either of the 2 engines recognized the token.
- list_of_suggestions: union of the 2 engines' suggestions.
+ For each token:
+ 0. preprocessing (kill commas etc.)
+ 1. use hunspell (hsobj) to check if word form exists, get list of suggestions if not
+ 2. use excdict to override false positives from hunspell
+ 3. filter out nonsense suggestions, leave only the top TOPNSUGG suggestions
"""
ret = []
+
# convert input to normal strings (utf8 encoding)
- inp = [x.encode('utf8') for x in unicode_toks]
- # call the 2 engines
- humorout = call_humor_spellchecker_cmdline_ntok(inp)
- hunspout = call_hunspell_cmdline_ntok(inp)
- if humorout[0] == None:
- if hunspout[0] == None:
- return [None, 'Double trouble: "{0}" + "{1}"'.format(humorout[1], hunspout[1])]
- else:
- return hunspout
- if hunspout[0] == None:
- return humorout # not None
- if len(humorout) != len(hunspout): # neither is None: sanity check
- return [None, 'Error: length of output from the 2 engines doesn''t match']
- # merge them
- ret = humorout
- for i, x in enumerate(hunspout):
- # --- hack: override with exception dictionary (if applicable)
- if len(hunspout) == len(inp): # for safety
- e = SPELL_EXC_DICT.get(inp[i])
- if e != None:
- ret[i] = e
- continue
- # --- hack end
- if x[0] == True or ret[i][0] == True:
- ret[i] = (True, [])
- else:
- # ret[i] = (False, list(set(x[1]+ret[i][1])))
- # new merge method: add new hunspell suggestions to end of humor list
- for s in x[1]:
- if s not in ret[i][1]:
- ret[i][1].append(s)
- # convert suggestions to Unicode strings
- for i in range(0, len(ret)):
- if ret[i][0] == False:
- ret[i] = (False, [unicode(x, 'utf8', 'replace') for x in ret[i][1]])
- return ret
+ inp = [x.encode('utf8') for x in utoks]
+ # do some preprocessing
+ inp = [x.replace(',', '') for x in inp] # kill commas in input
+ inp = [x.replace(':', ' ') for x in inp]
+ inp = [x.replace('_', ' ') for x in inp]
+ #hunspout = call_hunspell_cmdline_ntok(inp)
+ # For each token:
+ for tok in inp:
+ # check in false positives dictionary first
+ exc = excdict.get(tok)
+ if exc is not None:
+ ret.append(exc) # overwrite with (known, suggs) from exception dict
+ continue # we're done with this token
+
+ # process with hunspell
+ known, suggs = False, []
+ known = hsobj.spell(tok)
+ if not known:
+ suggs = hsobj.suggest(tok)
+ ret.append( (known, suggs) ) # save
+
+ # filter suggestions
+ if not known:
+ tmp = []
+ for s in suggs:
+ sx = s.split(' ')
+ if len(sx) > 1 and len([x for x in sx if len(x) == 1]) > 0: # skip suggestion if contains space-delimited tokens and any token is only 1 char long (e.g. "u száj" for "uszáj")
+ continue
+ tmp.append(s)
+ if len(tmp) == TOPNSUGG: # no more than TOPNSUGG
+ break
+ ret[-1] = (False, tmp)
+
+ # convert back to unicode and return
+ return [(x[0], [unicode(y, 'utf8', 'replace') for y in x[1]]) for x in ret]
+
+
def get_productive_tips(tok, hanas):
"""Returns unicode string with tips on correct language usage.
Returns None if no tips can be associated.
@@ -360,43 +277,60 @@
def process(utoks, db):
- """New main entry point function for module.
+ """New main entry point function for module, starting from 2015 September (AkH 12!)
+ Uses 2 versions of hunspell tuned for AkH11 and AkH12.
Param utoks: array of unicode strings, the input tokens
Param db: a gluon.DAL object representing an open database connection
- Returns an array containing 3-tuples for each token:
- [(token_is_correct, suggestions, tips), ...]
+ Returns [{akh11_correct: ..., akh11_suggestions: ..., akh12_correct: ..., akh12_suggestions: ..., tips: ...}, ...] (one dictionary per input token)
where:
- token_is_correct: True if either of the 2 engines recognized the token
- suggestions: list of unicode strings, union of the 2 engines' suggestions or [] if token_is_correct=True (or no suggestions available), see union_humor_hunspell()
- tips is {field: value, ...} or {} (unicode strings)
+ akh11_correct: True iff hunspell (with AkH11 custom dicts) recognized the token
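+ akh11_suggestions: list of unicode string suggestions or [] if akh11_correct=True (or no suggestions available)
+ akh12_correct: same as akh11_correct, but with the AkH12 custom dicts
+ akh12_suggestions: same as akh11_suggestions, but with the AkH12 custom dicts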
+ tips: {field: value, ...} or {} (unicode strings)
Returns [None, <error message>] if some critical error occured.
Returns [] if input was emtpy array or contained no meaningful tokens (e.g. only punctuation).
"""
ret = []
- # call humor and hunspell suggestors
- sugg = union_humor_hunspell(utoks)
- if sugg[0] == None: # error?
- return sugg
- if len(sugg) != len(utoks): # sanity check
- return [None, 'Error: len(sugg)!=len(utoks)']
- if sugg == []: # no (meaningful) input
- return []
- for i, tok in enumerate(utoks):
- # call humor for stemming and morph. analysis
- tok8 = tok.encode('utf8').replace(',', '')
+
+ # For each token: add info from HOMONYMS, hunspell, dictionary tips, productive tips
+ for i, utok in enumerate(utoks):
+
+ # convert to utf8, kill commas
+ tok8 = utok.encode('utf8').replace(',', '')
+
# check in HOMONYMS dict first
- if tok8 in HOMONYMS:
- ret.append( (True, [], {u'markmin': safe_unic(HOMONYMS[tok8])}) ) # 'markmin' key in tips: format in view with markmin
- continue # no need for others
+ hom = HOMONYMS.get(tok8)
+ if hom is not None:
+ res = dict(akh11_correct=True, akh11_suggestions=[], akh12_correct=True, akh12_suggestions=[], tips={u'markmin': safe_unic(hom)})
+ ret.append(res)
+ continue # no need for other knowledge sources for this token
+
+ # call hunspell with AkH11 and AkH12 setups (returned: 1-element lists)
+ akh11 = check_with_hunspell(PYHUNSPELL_AKH11, SPELL_EXC_DICT_AKH11, [utok])
+ akh12 = check_with_hunspell(PYHUNSPELL_AKH12, SPELL_EXC_DICT_AKH12, [utok])
+
+ # call Humor for stemming and morph. analysis
hanas = StemmingAnalysis(tok8).getAnas()
+
# get dictionary-based tips
- tips = get_dictionary_tips(tok8, hanas, db)
+ dtips = get_dictionary_tips(tok8, hanas, db)
+
# get productive tips
ptip = get_productive_tips(tok8, hanas)
- if ptip != None:
- tips['prod_tip'] = ptip
- # merge with suggestions
- ret.append( (sugg[i][0], sugg[i][1], tips) )
+
+ # save everything
+ res = {}
+ res['akh11_correct'] = akh11[0][0] if len(akh11) != 0 else False
+ res['akh11_suggestions'] = akh11[0][1] if len(akh11) != 0 else []
+ res['akh12_correct'] = akh12[0][0] if len(akh12) != 0 else False
+ res['akh12_suggestions'] = akh12[0][1] if len(akh12) != 0 else []
+ res['tips'] = dtips
+ if ptip is not None:
+ res['tips']['prod_tip'] = ptip
+ ret.append(res)
+
+ #return ret
return ret
@@ -408,30 +342,6 @@
else: return False
-def interactive_test(db):
- """Interactive testing: user types a token to stdin, call analyzers and print results to stdout, repeat until blank line is entered.
- """
- print("Type words to analyze, blank line + <Enter> to exit")
- while True:
- inp = sys.stdin.readline().rstrip()
- if not input_check(unicode(inp, 'utf8')):
- print('A bemenet nem megengedett karakter(eke)t tartalmaz, kérjük, ellenőrizze! Csak a magyar ábécé kis- és nagybetűi, a számjegyek, a szóköz, a pont és a kötőjel, valamint a "%" karakter megengedettek.')
- continue
- if not inp: break
- else: inp = inp.split(' ')
- x = call_humor_spellchecker_cmdline_ntok(inp)
- y = call_hunspell_cmdline_ntok(inp)
- z = union_humor_hunspell([unicode(t, 'utf8') for t in inp])
- p = process([unicode(t, 'utf8') for t in inp], db)
- print('humor={0}\n'.format(x))
- print('hunspell={0}\n'.format(y))
- print('exc_dict={0}\n'.format([SPELL_EXC_DICT.get(x) for x in inp]))
- print('unio={0}\n'.format(z))
- print('process()={0}'.format(p))
- print('')
- return
-
-
def safe_unic(s, enc='utf8'):
"""If s is a string, convert it to unicode using enc encoding.
If s is already unicode, just return it.
@@ -446,6 +356,48 @@
return unicode(str(s), enc, 'replace')
+def interactive_test(db):
+ """Interactive testing: user types a token to stdin, call analyzers and print results to stdout, repeat until blank line is entered.
+ """
+ import pprint
+ print("Type words to analyze, blank line + <Enter> to exit")
+ while True:
+
+ inp = sys.stdin.readline().rstrip()
+
+ if not input_check(unicode(inp, 'utf8')):
+ print('A bemenet nem megengedett karakter(eke)t tartalmaz, kérjük, ellenőrizze! Csak a magyar ábécé kis- és nagybetűi, a számjegyek, a szóköz, a pont és a kötőjel, valamint a "%" karakter megengedettek.')
+ continue
+
+ if not inp:
+ break
+
+ tinp = inp.split(' ')
+
+ print('')
+ p = process([unicode(t, 'utf8') for t in tinp], db)
+ print('process():')
+ pprint.pprint(p)
+ print('')
+
+ x1 = [SPELL_EXC_DICT_AKH11.get(x) for x in tinp]
+ print('SPELL_EXC_DICT_AKH11:')
+ pprint.pprint(x1)
+ print('')
+
+ x2 = [SPELL_EXC_DICT_AKH12.get(x) for x in tinp]
+ print('SPELL_EXC_DICT_AKH12:')
+ pprint.pprint(x2)
+ print('')
+
+ h = [HOMONYMS.get(x) for x in tinp]
+ print('HOMONYMS:')
+ pprint.pprint(h)
+ print('')
+
+ return
+
+
if __name__ == '__main__':
print 'Connecting to database...'
Added: trunk/web2py/applications/helyesiras_webdev/modules/spell_deprecated_akh11.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/spell_deprecated_akh11.py (rev 0)
+++ trunk/web2py/applications/helyesiras_webdev/modules/spell_deprecated_akh11.py 2015-09-02 16:48:38 UTC (rev 1456)
@@ -0,0 +1,455 @@
+#!/usr/bin/env python
+# coding: UTF-8
+
+"""
+API for accessing humor-2005 and hunspell spellchecking and suggesting binaries
++ news TODO writeme
+
+@author: MM
+@requires: python 2.x
+"""
+
+import os
+import re
+import subprocess
+import sys
+
+sys.path.append('/opt/web2py')
+from gluon import *
+from gluon.contrib.pymysql import escape_string
+
+from MemcacheHelper import memcachememoize
+from egybekulon2_humor import StemmingAnalysis, HumorAna, Morph
+
+USRINP_REGEXP = re.compile( unicode( r'^[0-9a-zA-ZáéíóöőúüűÁÉÍÓÖŐÚÜŰ \-\.\n\t%]*$', 'utf8' ) )
+"""Regexp to check if user input contains only allowed characters. Use it on a Unicode object."""
+
+CMD_HUMOR_SPELLCHECKER = ['/home/projects/helyesiras/bin/humor2005_spell-suggest', '/home/projects/helyesiras/lib/humor2005', '1038', '65001']
+"""Command-line string for calling humor."""
+CMD_HUNSPELL = ["hunspell", "-d", "hu_HU", "-p", os.path.dirname(os.path.realpath(__file__)) + '/resources/sajat_szotar', "-i", "utf-8"]
+"""Command-line string for calling hunspell"""
+
+TOPNSUGG = 5
+"""Use only top n spell suggestions from the engine(s) for unknown tokens"""
+
+PRONOUNS = {
+'TPe3': u'ő',
+'TPt1': u'mi',
+'TPt2': u'ti',
+'TPt3': u'ők'
+}
+"""Used by get_productive_tips()"""
+
+VOWELS = [u'a', 'á', u'e', u'é', u'i', u'í', u'o', u'ó', u'ö', u'ő', u'u', u'ú', u'ü', u'ű']
+"""Used by get_productive_tips()"""
+
+"""Another exception dictionary, used before the engines: "word (utf8)" => "explaining text (utf8)"
+Explaining text may include markmin formatting codes (see http://www.web2py.com/init/static/markmin.html)
+The special code `` ``:BR means line break.
+"""
+HOMONYMS = {
+ "kérd": """1. ’''Kérd'' el a könyvet!’ (A ''kér'' ige egyes szám, második személyű, felszólító módú alakja.)
+
+2. ’Ő ''kérd'' valamit.’ (A kérd [’kérdez’] ige egyes szám, harmadik személyű, kijelentő módú, alanyi ragozású alakja.)`` ``:BR
+Ebben a formában ritkán használatos, inkább első és második személyben, pl. ’Miért ''kérded''?’`` ``:BR
+Felszólító módban, egyes szám, harmadik személyben két d-vel: „Ne ''kérdd'' az én siralmimnak okát” (Balassi) """
+}
+
+"""Exception dictionary, used before the engines
+File format: wordform TAB 1 (wf. is correct) or 0 (wf. is incorrect) TAB suggestion if incorrect (or empty if correct)
+"""
+SPELL_EXC_DICT = {} # {wordform: (correct_or_not, [suggestion_if_notcorrect]), ... } -- keys and values are normal strings (utf8)
+try:
+ for line in open(os.path.dirname(os.path.realpath(__file__))+'/resources/spell_exceptions.tsv'):
+ line = line.strip()
+ if not line or line.startswith('#'):
+ continue
+ r = line.split('\t')
+ if len(r) not in [2, 3]:
+ sys.stderr.write('spell.py: error in exceptions file: {0}\n'.format(line))
+ continue
+ if r[1] == '1':
+ SPELL_EXC_DICT[r[0]] = (True, [])
+ elif r[1] == '0' and len(r) == 3:
+ SPELL_EXC_DICT[r[0]] = (False, [r[2]])
+except Exception as e:
+ sys.stderr.write('spell.py: exception while loading exceptions file:\n{0}\n'.format(str(e)))
+ SPELL_EXC_DICT = {}
+#print('Exception dictionary:')
+#print(str(SPELL_EXC_DICT))
+
+
+@memcachememoize
+def call_humor_spellchecker_cmdline_ntok(toks):
+ """Calls humor command-line spellchecker via a system call.
+
+ Parameter toks is an array containing the tokens to be analyzed (strings, utf-8 encoding).
+
+ Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
+ token_is_correct: True if the nth token was recognized, False otherwise
+ [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms, or [] if no suggestions available.
+
+ NOTE: comma characters in the input tokens are deleted before processing with the spellchecker.
+
+ Returns [None, <error message>] if some critical error occurred.
+ Returns [] if input was an empty array or contained no meaningful tokens (e.g. only punctuation).
+ """
+ toks = [x.replace(',', '') for x in toks] # kill commas in input
+ import shlex
+ try:
+ inp = '\n'.join(toks)
+ p = subprocess.Popen(CMD_HUMOR_SPELLCHECKER, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ (stout, sterr) = p.communicate(inp)
+ if p.returncode != 0:
+ return [None, 'Command returned with exitcode {0}'.format(p.returncode)]
+ ret = []
+ for x in stout.split('\n'):
+ x = x.rstrip()
+ if not x:
+ continue
+ if not x.startswith('%'):
+ ret.append((True, []))
+ else:
+ if '\t' not in x:
+ ret.append((False, []))
+ else:
+ ret.append( (False, x.split('\t')[1].split(',')) )
+ return ret
+ except:
+ return [None, 'Exception: {0}'.format(sys.exc_info())]
+
+
+@memcachememoize
+def call_hunspell_cmdline_ntok(toks):
+ """Calls hunspell command-line spellchecker via a system call.
+
+ Parameter toks is an array containing the tokens to be analyzed (strings, utf-8 encoding).
+
+ Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
+ token_is_correct: True if the nth token was recognized, False otherwise
+ [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms, or [] if no suggestions available.
+
+ NOTE: comma characters in the input tokens are deleted before processing with the spellchecker.
+
+ Returns [None, <error message>] if some critical error occurred.
+ Returns [] if input was an empty array or contained no meaningful tokens (e.g. only punctuation).
+
+ """
+ toks = [x.replace(',', '') for x in toks] # kill commas in input
+ toks = [x.replace(':', ' ') for x in toks]
+ toks = [x.replace('_', ' ') for x in toks]
+ try:
+ inp = '\n'.join(toks)
+ p = subprocess.Popen(CMD_HUNSPELL, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ (stout, sterr) = p.communicate(inp)
+ if p.returncode != 0:
+ return [None, 'Command returned with exitcode {0}'.format(p.returncode)]
+ ret = []
+ c = 0
+ for line in stout.split('\n'):
+ line = line.rstrip()
+ if c == 0: # skip first line: hunspell banner
+ c += 1
+ continue
+ if not line: # skip empty line
+ continue
+ if line[0] in '*+-': # token correct
+ ret.append( (True, []) )
+ elif line.endswith(' 0'): # token incorrect, no suggestions
+ ret.append( (False, []) )
+ else: # token incorrect, suggestions available
+ m = re.match("^[^:]*: (.*)$", line)
+ ret.append( (False, m.group(1).split(", ")) )
+ return ret
+ except:
+ return [None, 'Exception: {0}'.format(sys.exc_info())]
+
+
+def union_humor_hunspell(unicode_toks):
+ """This is an experimental modification of union_humor_hunspell_DEPRECATED().
+ Calls only hunspell. Suggestions are filtered: skip if contains space and any token is 1 char long, keep only up to TOPNSUGG suggestions.
+ ALSO uses exception dictionary SPELL_EXC_DICT: if token is found, use value from there,
+ regardless of either engines' outputs.
+ NOTE: this differs from the humor and hunspell functions, which expect arrays of normal strings, not Unicode strings!
+ Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
+ token_is_correct: True if the nth token was recognized, False otherwise
+ [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms (Unicode strings), or [] if no suggestions available.
+ Returns [None, <error message>] if some critical error occurred.
+ Returns [] if input was an empty array or contained no meaningful tokens (e.g. only punctuation).
+ """
+ ret = []
+ # convert input to normal strings (utf8 encoding)
+ inp = [x.encode('utf8') for x in unicode_toks]
+ # call the engine
+ hunspout = call_hunspell_cmdline_ntok(inp)
+ # process
+ ret = hunspout
+ for i, x in enumerate(hunspout):
+ # override with exception dictionary (if applicable)
+ if len(hunspout) == len(inp): # safety check
+ e = SPELL_EXC_DICT.get(inp[i])
+ if e != None:
+ ret[i] = e
+ continue
+ # filter suggestions & convert them to unicode
+ if ret[i][0] == False:
+ tmp = []
+ for s in ret[i][1]:
+ t = unicode(s, 'utf8', 'replace')
+ sx = t.split(' ')
+ if len(sx) > 1 and len([x for x in sx if len(x) == 1]) > 0: # skip suggestion if contains space-delimited tokens and any token is only 1 char long (e.g. "u száj" for "uszáj")
+ continue
+ tmp.append(t)
+ if len(tmp) == TOPNSUGG: # no more than TOPNSUGG
+ break
+ ret[i] = (False, tmp)
+ return ret
+
+
+def union_humor_hunspell_DEPRECATED(unicode_toks):
+ """DEPRECATED. See union_humor_hunspell().
+ Calls humor + hunspell on elements of unicode_toks, which is an array of Unicode strings (tokens).
+ ALSO uses exception dictionary SPELL_EXC_DICT: if token is found, use value from there,
+ regardless of either engines' outputs.
+ NOTE: this differs from the humor and hunspell functions, which expect arrays of normal strings, not Unicode strings!
+ Returns an array: [(token_is_correct, [list_of_suggestions]), ...]
+ token_is_correct: True if the nth token was recognized, False otherwise
+ [list_of_suggestions]: [] if token_is_correct==True, otherwise may be a list of suggested correct forms (Unicode strings), or [] if no suggestions available.
+ Returns [None, <error message>] if some critical error occurred.
+ Returns [] if input was an empty array or contained no meaningful tokens (e.g. only punctuation).
+
+ token_is_correct: True if either of the 2 engines recognized the token.
+ list_of_suggestions: union of the 2 engines' suggestions.
+ """
+ ret = []
+ # convert input to normal strings (utf8 encoding)
+ inp = [x.encode('utf8') for x in unicode_toks]
+ # call the 2 engines
+ humorout = call_humor_spellchecker_cmdline_ntok(inp)
+ hunspout = call_hunspell_cmdline_ntok(inp)
+ if humorout[0] == None:
+ if hunspout[0] == None:
+ return [None, 'Double trouble: "{0}" + "{1}"'.format(humorout[1], hunspout[1])]
+ else:
+ return hunspout
+ if hunspout[0] == None:
+ return humorout # not None
+ if len(humorout) != len(hunspout): # neither is None: sanity check
+ return [None, 'Error: length of output from the 2 engines doesn''t match']
+ # merge them
+ ret = humorout
+ for i, x in enumerate(hunspout):
+ # --- hack: override with exception dictionary (if applicable)
+ if len(hunspout) == len(inp): # for safety
+ e = SPELL_EXC_DICT.get(inp[i])
+ if e != None:
+ ret[i] = e
+ continue
+ # --- hack end
+ if x[0] == True or ret[i][0] == True:
+ ret[i] = (True, [])
+ else:
+ # ret[i] = (False, list(set(x[1]+ret[i][1])))
+ # new merge method: add new hunspell suggestions to end of humor list
+ for s in x[1]:
+ if s not in ret[i][1]:
+ ret[i][1].append(s)
+ # convert suggestions to Unicode strings
+ for i in range(0, len(ret)):
+ if ret[i][0] == False:
+ ret[i] = (False, [unicode(x, 'utf8', 'replace') for x in ret[i][1]])
+ return ret
+
+
+def get_productive_tips(tok, hanas):
+ """Returns unicode string with tips on correct language usage.
+ Returns None if no tips can be associated.
+ tok: the input token (utf8 string)
+ hanas: an array of HumorAna objects representing analyses tok (utf8 string fields!)
+ """
+ if hanas == []: # no analyses: return None
+ return None
+ for ana in hanas:
+ if ana.pos == "IGE":
+ if re.match(".*[bcfghjklmnprstvz]d$", tok): # msh. + -d(d) vegu igek
+ for morph in ana.morphs:
+ # kezd ~ kezdd, mond ~ mondd
+ if morph.tag == "e3":
+ tip = u"Ő " + unicode(tok, 'utf8') + u" [valamit]. (Harmadik személy, kijelentő mód. Felszólító módban, egyes szám, második személyben két d-vel.)"
+ return tip
+ # nakolas
+ for morph in ana.morphs:
+ if "TFt3" in morph.tag:
+ tip = u"Ők " + unicode(tok, 'utf8') + u" azt."
+ return tip
+ # suksuk
+ if re.match(".*(szt|[^s]t)$", ana.stem): # -t es -szt vegu igek
+ for morph in ana.morphs:
+ if re.match("TP(e3|t[123])", morph.tag):
+ pronoun = PRONOUNS[morph.tag]
+ tip = u"Azt akarják, hogy ne " + unicode(tok, 'utf8') + u" tovább."
+ return tip
+ elif re.match(".*st$", ana.stem): # -st vegu igek
+ for morph in ana.morphs:
+ if "TPt1" in morph.tag:
+ pronoun = PRONOUNS[morph.tag]
+ tip = u"Azt akarják, hogy ne " + unicode(tok, 'utf8') + u" tovább."
+ return tip
+ elif ana.pos == "FN":
+ # det = u"az" if tok[0].lower() in VOWELS else u"a"
+ # -ba/-be; -ban/-ben
+ for morph in ana.morphs: # -bV
+ if "ILL" in morph.tag:
+ #tip = u"Hová? kérdésre válaszolva, pl. elmentem " + det + u" " + unicode(tok, 'utf8') + u"."
+ tip = u"Általában a Hová? kérdésre válaszolva."
+ return tip
+ elif "INE" in morph.tag: # -bVn
+ #tip = u"Hol? kérdésre válaszolva, pl. " + det + u" " + unicode(tok, 'utf8') + u" vagyok."
+ tip = u"Általában a Hol? kérdésre válaszolva."
+ return tip
+ return None
+
+
def _query_tip_rows(db, sql):
    """Run sql on db and return the result rows as a list of dicts.

    Returns None if the query raised an exception, so that the caller can
    tell "query failed" apart from "query matched nothing" (empty list).
    """
    try:
        return db.executesql(sql, as_dict=True)
    except Exception:  # any DB error: caller reports the token as unknown
        return None


def _row_to_tips(row):
    """Return the non-NULL columns of a result row as a {unicode: unicode} dict."""
    tips = {}
    for k, v in row.items():
        if v is not None:
            tips[safe_unic(k)] = safe_unic(v)
    return tips


def get_dictionary_tips(tok, hanas, db):
    """Returns dictionary with keys as tip field names and values as tip field values
    (keys and values are unicode strings) associated with input token tok.
    Returns empty dict if no data could be found or if any of the DB queries failed.
    tok: an input token (utf8 string)
    hanas: an array of HumorAna objects representing analyses of the input token
    db: a gluon.DAL object representing an open database connection to dbdict DB
    """
    ret = {}
    # assemble set of strings to query:
    #   1. the surface form itself! e.g. gója => gója (surf), gó (*stem) ==> gólya :)
    #   2. stems from humor
    stems = set([tok])
    for ana in hanas:
        if ana.stem != '':
            stems.add(ana.stem)
    # 1. query paronyms with the stem(s)
    for stem in stems:
        sql = """SELECT * FROM sugg_types t
                 JOIN sugg_types_paronyms p ON t.id=p.id
                 WHERE t.actual={0}
              """.format(escape_string(stem))
        rows = _query_tip_rows(db, sql)
        if rows is None:  # query failed: return as unknown
            return {}
        if len(rows) > 0:
            ret.update(_row_to_tips(rows[0]))  # TODO: what if len(rows) > 1
            break  # TODO: what if >1 stem matches
    # 2. query slangs with the surface form only
    sql = """SELECT * FROM sugg_types t
             JOIN sugg_types_slang s ON t.id=s.id
             WHERE t.actual={0}
          """.format(escape_string(tok))
    rows = _query_tip_rows(db, sql)
    if rows is None:  # query failed: return as unknown (paronym data is dropped too)
        return {}
    if len(rows) > 0:
        ret.update(_row_to_tips(rows[0]))  # TODO: what if len(rows) > 1
    return ret
+
+
def process(utoks, db):
    """New main entry point function for module.
    Param utoks: array of unicode strings, the input tokens
    Param db: a gluon.DAL object representing an open database connection
    Returns an array containing 3-tuples for each token:
      [(token_is_correct, suggestions, tips), ...]
    where:
      token_is_correct: True if either of the 2 engines recognized the token
      suggestions: list of unicode strings, union of the 2 engines' suggestions or [] if token_is_correct=True (or no suggestions available), see union_humor_hunspell()
      tips is {field: value, ...} or {} (unicode strings)
    Returns [None, <error message>] if some critical error occurred.
    Returns [] if input was empty array or contained no meaningful tokens (e.g. only punctuation).
    """
    # call humor and hunspell suggestors
    sugg = union_humor_hunspell(utoks)
    # The emptiness check has to come first: indexing sugg[0] on an empty
    # result would raise IndexError instead of returning [] as documented.
    if not sugg:  # no (meaningful) input
        return []
    if sugg[0] is None:  # error?
        return sugg
    if len(sugg) != len(utoks):  # sanity check
        return [None, 'Error: len(sugg)!=len(utoks)']
    ret = []
    for i, tok in enumerate(utoks):
        # the helpers below work on utf8 byte strings; commas are dropped from the token
        tok8 = tok.encode('utf8').replace(',', '')
        # check in HOMONYMS dict first
        if tok8 in HOMONYMS:
            # 'markmin' key in tips: format in view with markmin
            ret.append((True, [], {u'markmin': safe_unic(HOMONYMS[tok8])}))
            continue  # no need for others
        # call humor for stemming and morph. analysis
        hanas = StemmingAnalysis(tok8).getAnas()
        # get dictionary-based tips
        tips = get_dictionary_tips(tok8, hanas, db)
        # get productive tips
        ptip = get_productive_tips(tok8, hanas)
        if ptip is not None:
            tips['prod_tip'] = ptip
        # merge with suggestions
        ret.append((sugg[i][0], sugg[i][1], tips))
    return ret
+
+
def input_check(inp):
    """Tell whether inp consists solely of characters allowed by USRINP_REGEXP.

    Note: inp should be a unicode string!
    Returns True on a match, False otherwise.
    """
    return USRINP_REGEXP.match(inp) is not None
+
+
def interactive_test(db):
    """Interactive test loop.

    Reads one line of space-separated tokens from stdin, runs the humor and
    hunspell command-line wrappers, the exception dictionary lookup, the
    union of the two engines and process() on them, and prints each result
    to stdout. Stops when a blank line is entered.
    db: passed on unchanged to process()
    """
    print("Type words to analyze, blank line + <Enter> to exit")
    while True:
        line = sys.stdin.readline().rstrip()
        if not input_check(unicode(line, 'utf8')):
            print('A bemenet nem megengedett karakter(eke)t tartalmaz, kérjük, ellenőrizze! Csak a magyar ábécé kis- és nagybetűi, a számjegyek, a szóköz, a pont és a kötőjel, valamint a "%" karakter megengedettek.')
            continue
        if not line:
            break
        toks = line.split(' ')
        # run every analyzer first, print afterwards
        humor_res = call_humor_spellchecker_cmdline_ntok(toks)
        hunspell_res = call_hunspell_cmdline_ntok(toks)
        union_res = union_humor_hunspell([unicode(t, 'utf8') for t in toks])
        process_res = process([unicode(t, 'utf8') for t in toks], db)
        print('humor={0}\n'.format(humor_res))
        print('hunspell={0}\n'.format(hunspell_res))
        print('exc_dict={0}\n'.format([SPELL_EXC_DICT.get(t) for t in toks]))
        print('unio={0}\n'.format(union_res))
        print('process()={0}'.format(process_res))
        print('')
    return
+
+
def safe_unic(s, enc='utf8'):
    """Return s as a unicode object.

    unicode input is returned untouched; a byte string is decoded with enc;
    anything else is first turned into a byte string with str() and then
    decoded. Undecodable bytes are substituted ('replace' error handling).
    This is a workaround: different versions of web2py return string/unicode in DB query.
    Also, different column values are returned as either str or unicode depending on collation.
    """
    if isinstance(s, unicode):
        return s
    raw = s if isinstance(s, str) else str(s)
    return unicode(raw, enc, 'replace')
+
+
# Script entry point: connect to the dictionary database and start the
# interactive test loop.
if __name__ == '__main__':

    print 'Connecting to database...'
    # NOTE(review): the connection URI hard-codes the DB user and password in
    # source control -- consider reading them from configuration instead.
    db = DAL('mysql://dbdicter:dbdicter123@localhost/dbdict')
    print 'Done'

    interactive_test(db)
Added: trunk/web2py/applications/helyesiras_webdev/modules/spell_helpers.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/spell_helpers.py (rev 0)
+++ trunk/web2py/applications/helyesiras_webdev/modules/spell_helpers.py 2015-09-02 16:48:38 UTC (rev 1456)
@@ -0,0 +1,53 @@
+"""
+Helper functions for spell.py
+"""
+__author__ = 'mm'
+
+import hunspell
+import sys
+
+
def load_spell_exc_dict(filename):
    """
    Load the spell-checker exceptions table from a tab-separated file.

    Blank lines and lines starting with '#' are skipped. Every other line must
    have 2 or 3 tab-separated fields: <word> <flag> [<suggestion>].
    flag '1' maps the word to (True, []); flag '0' together with a third field
    maps it to (False, [suggestion]). Lines with a wrong field count are
    reported on stderr and skipped; any other combination is silently ignored.

    :param filename: name of tsv file with false positives
    :return: dict (see SPELL_EXC_DICT_AKH11|2), may be empty; {} is also
             returned if the file cannot be read (the error is written to stderr)
    """
    ret = {}
    try:
        # "with" makes sure the file handle is closed, even if parsing fails
        with open(filename) as exc_file:
            for line in exc_file:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                r = line.split('\t')
                if len(r) not in (2, 3):
                    sys.stderr.write('spell.py: error in exceptions file: {0}\n'.format(line))
                    continue
                if r[1] == '1':
                    ret[r[0]] = (True, [])
                elif r[1] == '0' and len(r) == 3:
                    ret[r[0]] = (False, [r[2]])
    except Exception as e:
        # best effort: a missing/unreadable file must not break the importing module
        sys.stderr.write('spell.py: exception while loading exceptions file {0}:\n{1}\n'.format(filename, str(e)))
        return {}
    return ret
+
+
def init_pyhunspell(dicfile, afffile, customdictfile):
    """
    Initialize a hunspell.HunSpell object with the dic and aff file and load custom dictionary entries.

    Each non-blank line of the custom dictionary is either "word" (added with
    HunSpell.add) or "word/known_word" (added with HunSpell.add_with_affix).
    Lines with more than one '/' are reported on stderr and skipped.

    :param dicfile: name of .dic file
    :param afffile: name of .aff file
    :param customdictfile: name of custom dictionary file (lines: "unknown_word(\/known_analogous_word)?"), UTF-8 encoding
    :return: an initialized hunspell.HunSpell object
    """
    hs = hunspell.HunSpell(dicfile, afffile)
    with open(customdictfile) as cdf:
        for line in cdf:
            entry = line.strip()
            if not entry:
                # blank line: do not register the empty string as a word
                continue
            fields = entry.split('/')
            if len(fields) == 1:
                hs.add(fields[0])
            elif len(fields) == 2:
                hs.add_with_affix(fields[0], fields[1])
            else:
                # report the stripped entry and terminate the message with a newline
                sys.stderr.write('Error in file {0}: "{1}"\n'.format(customdictfile, entry))
    return hs
More information about the Hejes-devel
mailing list