[Hejes-devel] [740] OH hits re-suffixated according to the original input.
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Tue May 14 17:11:14 CEST 2013
Revision: 740
Author: joker
Date: 2013-05-14 17:11:14 +0200 (Tue, 14 May 2013)
Log Message:
-----------
OH hits re-suffixated according to the original input. (Feature #447)
Modified Paths:
--------------
trunk/web2py/applications/helyesiras_webdev/modules/egybekulon2.py
Modified: trunk/web2py/applications/helyesiras_webdev/modules/egybekulon2.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/egybekulon2.py 2013-05-14 14:53:00 UTC (rev 739)
+++ trunk/web2py/applications/helyesiras_webdev/modules/egybekulon2.py 2013-05-14 15:11:14 UTC (rev 740)
@@ -27,17 +27,6 @@
_USRINP_REGEXP = re.compile( unicode( r'^[0-9a-zA-ZáéíóöőúüűÁÉÍÓÖŐÚÜŰ \-\.\n\t]+$', 'utf8' ) )
"""Regexp to check if user input contains only allowed characters. Use it on a Unicode object."""
-def normalize_input( toklst ):
- """Normalizes input (for query_oh()):
- generates a concatenated string without any spaces.
- Input: a TokenList (processes self.toklst if nothing specified)
- Returns: the concatenated string
- """
- s = ''
- for i, t in enumerate(toklst.toks):
- s += t.tok
- return s
-
class SegmentedInputUseCase:
"""Use case:
@@ -79,6 +68,7 @@
explain = '' # plain-text version of NL explanations set by calling getExplanations() with plaintxt=True
explainx = '' # XHTML markup of NL explanations set by calling getExplanations()
oh_hits = [] # lookup results in OH dict from DB
+ last_token_stems = [] # stems of last token (~head) for stemmed OH lookup
def __init__(self, dbconn):
"""Initialize this session.
@@ -218,24 +208,73 @@
Normalization: join together tokens without spaces (for normalized querying in DB).
Stemming: use stem by humor on last token (~head) instead of surface form. May have different stems, then return all possible combinations with previous tokens.
Uses self.toklst.
+ Sets self.last_token_stems.
Example: if input tokens were ["béke", "párt"] function returns ["békepár", "békepárt"]
"""
if len(self.toklst.toks) == 0:
return []
# get possible stems of head
- heads = []
+ self.last_token_stems = []
for ana in self.toklst.toks[-1].humoranas:
if ana.stem != '':
- heads.append(ana.stem)
+ self.last_token_stems.append(ana.stem)
# no stemming: use surface form of head (return 1 form only)
- if heads == []:
+ if self.last_token_stems == []:
return ''.join([t.tok for t in self.toklst.toks])
# generate normalized forms using head stems
ret = []
- for head in heads:
+ for head in self.last_token_stems:
ret.append(''.join([t.tok for t in self.toklst.toks[:-1]]) + head)
return ret
def get_suffixated_forms(self, form=None):
    """Return an OH dictionary hit re-suffixated according to the original toklst.

    Stemming strips the suffix of the last input token before the OH
    lookup; this method re-attaches the original (suffixated) surface
    form of the last token to a hit.

    Param: form -- an OH hit (the `actual' form from the dictionary)
    Returns: a set of re-suffixated forms
    See Feature #447
    """
    ret = set()
    found_head = False
    # if a head (stem of the last token) is found at the end of form,
    # swap it for the original surface form of the last token
    for head in self.last_token_stems:
        if form.endswith(head):
            # form w/o stem of last token at the end + original last token
            ret.add(form[0:form.rfind(head)] + self.toklst.toks[-1].tok)
            found_head = True
    # if NO head is found at the end of form -- because form contains
    # a ' ' or a '-' inside the original head -- align form with the
    # normalized input character by character
    if not found_head:
        norm_input = ''.join([t.tok for t in self.toklst.toks])  # code repeat?
        i = 0  # position in norm_input
        j = 0  # position in form
        suffixated = ''
        # compare norm_input and form char by char
        while i < len(norm_input):
            # read '' from form once we are past its end
            # (bounds check instead of the former bare except:)
            fj = form[j] if j < len(form) else ''
            if norm_input[i] == fj:
                suffixated += fj
                i += 1
                j += 1
            elif fj == ' ' or fj == '-':
                # separator present only in form: keep it
                suffixated += fj
                j += 1
            else:
                # the difference is not a ' ' or a '-',
                # so we are at the suffix:
                # take the ending of the original input!
                suffixated += norm_input[i:]
                break
        ret.add(suffixated)
    if ret:
        return ret
    ret.add(form)  # return form unchanged if we were not able to suffixate
    return ret
+
def query_oh(self, toklst=None):
"""Query OH to check whether user input is correct
according to the dictionary.
@@ -247,15 +286,18 @@
"""
if toklst == None:
toklst = self.toklst
+ # stemming (+ normalization)
norm_strings = self.get_normalized_forms()
- self.oh_hits = []
try:
sql = 'SELECT actual FROM `ohdict` WHERE norm IN ({0});'.format(','.join([escape_string(x) for x in norm_strings]))
result = self.dbconn.executesql(sql, as_dict=True) # e.g. [{actual='alma fa'}, {actual='almafa'}]
except:
return False # return safely from any DB error
+ # inverse of stemming + store
+ hits = set([])
for t in result:
- self.oh_hits.append( t['actual'] )
+ hits.update( self.get_suffixated_forms( t['actual'] ) )
+ self.oh_hits = list(hits)
if len(self.oh_hits) < 1:
return False
return True
More information about the Hejes-devel
mailing list