[Hejes-devel] [740] OH hits re-suffixated according to the original input.
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Tue May 14 17:11:14 CEST 2013
Revision: 740
Author: joker
Date: 2013-05-14 17:11:14 +0200 (Tue, 14 May 2013)
Log Message:
-----------
OH hits re-suffixated according to the original input. (Feature #447)
Modified Paths:
--------------
trunk/web2py/applications/helyesiras_webdev/modules/egybekulon2.py
Modified: trunk/web2py/applications/helyesiras_webdev/modules/egybekulon2.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/egybekulon2.py 2013-05-14 14:53:00 UTC (rev 739)
+++ trunk/web2py/applications/helyesiras_webdev/modules/egybekulon2.py 2013-05-14 15:11:14 UTC (rev 740)
@@ -27,17 +27,6 @@
_USRINP_REGEXP = re.compile( unicode( r'^[0-9a-zA-ZáéíóöőúüűÁÉÍÓÖŐÚÜŰ \-\.\n\t]+$', 'utf8' ) )
"""Regexp to check if user input contains only allowed characters. Use it on a Unicode object."""
-def normalize_input( toklst ):
- """Normalizes input (for query_oh()):
- generates a concatenated string without any spaces.
- Input: a TokenList (processes self.toklst if nothing specified)
- Returns: the concatenated string
- """
- s = ''
- for i, t in enumerate(toklst.toks):
- s += t.tok
- return s
-
class SegmentedInputUseCase:
"""Use case:
@@ -79,6 +68,7 @@
explain = '' # plain-text version of NL explanations set by calling getExplanations() with plaintxt=True
explainx = '' # XHTML markup of NL explanations set by calling getExplanations()
oh_hits = [] # lookup results in OH dict from DB
+ last_token_stems = [] # stems of last token (~head) for stemmed OH lookup
def __init__(self, dbconn):
"""Initialize this session.
@@ -218,24 +208,73 @@
Normalization: join together tokens without spaces (for normalized querying in DB).
Stemming: use stem by humor on last token (~head) instead of surface form. May have different stems, then return all possible combinations with previous tokens.
Uses self.toklst.
+ Sets self.last_token_stems.
Example: if input tokens were ["béke", "párt"] function returns ["békepár", "békepárt"]
"""
if len(self.toklst.toks) == 0:
return []
# get possible stems of head
- heads = []
+ self.last_token_stems = []
for ana in self.toklst.toks[-1].humoranas:
if ana.stem != '':
- heads.append(ana.stem)
+ self.last_token_stems.append(ana.stem)
# no stemming: use surface form of head (return 1 form only)
- if heads == []:
+ if self.last_token_stems == []:
return ''.join([t.tok for t in self.toklst.toks])
# generate normalized forms using head stems
ret = []
- for head in heads:
+ for head in self.last_token_stems:
ret.append(''.join([t.tok for t in self.toklst.toks[:-1]]) + head)
return ret
def get_suffixated_forms(self, form=None):
    """Return an OH dictionary hit re-suffixated according to the original toklst.

    Stemming strips the suffix of the last input token before the OH
    lookup; this method re-attaches the original (suffixated) surface
    form of the last token to a hit.

    Param: form -- an OH hit (the `actual' form from the dictionary)
    Returns: a set of re-suffixated forms
    See Feature #447
    """
    ret = set()
    found_head = False
    # if a head (stem of the last token) is found at the end of form,
    # swap it for the original surface form of the last token
    for head in self.last_token_stems:
        if form.endswith(head):
            # form w/o stem of last token at the end + original last token
            ret.add(form[0:form.rfind(head)] + self.toklst.toks[-1].tok)
            found_head = True
    # if NO head is found at the end of form -- because form contains
    # a ' ' or a '-' inside the original head -- align form with the
    # normalized input character by character
    if not found_head:
        norm_input = ''.join([t.tok for t in self.toklst.toks])  # code repeat?
        i = 0  # position in norm_input
        j = 0  # position in form
        suffixated = ''
        # compare norm_input and form char by char
        while i < len(norm_input):
            # read '' from form once we are past its end
            # (bounds check instead of the former bare except:)
            fj = form[j] if j < len(form) else ''
            if norm_input[i] == fj:
                suffixated += fj
                i += 1
                j += 1
            elif fj == ' ' or fj == '-':
                # separator present only in form: keep it
                suffixated += fj
                j += 1
            else:
                # the difference is not a ' ' or a '-',
                # so we are at the suffix:
                # take the ending of the original input!
                suffixated += norm_input[i:]
                break
        ret.add(suffixated)
    if ret:
        return ret
    ret.add(form)  # return form unchanged if we were not able to suffixate
    return ret
+
def query_oh(self, toklst=None):
"""Query OH to check whether user input is correct
according to the dictionary.
@@ -247,15 +286,18 @@
"""
if toklst == None:
toklst = self.toklst
+ # stemming (+ normalization)
norm_strings = self.get_normalized_forms()
- self.oh_hits = []
try:
sql = 'SELECT actual FROM `ohdict` WHERE norm IN ({0});'.format(','.join([escape_string(x) for x in norm_strings]))
result = self.dbconn.executesql(sql, as_dict=True) # e.g. [{actual='alma fa'}, {actual='almafa'}]
except:
return False # return safely from any DB error
+ # inverse of stemming + store
+ hits = set([])
for t in result:
- self.oh_hits.append( t['actual'] )
+ hits.update( self.get_suffixated_forms( t['actual'] ) )
+ self.oh_hits = list(hits)
if len(self.oh_hits) < 1:
return False
return True
More information about the Hejes-devel
mailing list