[Hejes-devel] [824] osiris analysis script check-in and move.

Wed Jun 19 11:14:28 CEST 2013

Revision: 824
Author:   hussami
Date:     2013-06-19 11:14:27 +0200 (Wed, 19 Jun 2013)
Log Message:
-----------
Osiris analysis script check-in and move. This version works OK on the Zsofi problem set, but needs further specs.

Removed Paths:
-------------
    trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py

Deleted: trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py
===================================================================

--- trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py	2013-06-18 13:49:09 UTC (rev 823)
+++ trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py	2013-06-19 09:14:27 UTC (rev 824)
@@ -1,330 +0,0 @@
-#!/usr/bin/env python
-"""# -*- coding: utf-8 -*- """
-# coding: utf8
-
-import sys
-import MySQLdb
-import re
-from egybekulon2_humor import StemmingAnalysis
-from egybekulon2_humor import HumorAna
-
-class MySQLHandler:
-  def __init__(self, verbose = 0):
-    self.connection = None
-    self.verbose = verbose
-    self.cursor = None
-    self.clear()
-
-  def clear(self):
-    self.disconnect()
-    self.cursor = None
-
-  def connect(self, server, user, pwd, dbs):
-    self.connection = MySQLdb.connect(host=server, user=user, passwd=pwd, \
-      db=dbs, charset='utf8')
-
-  def disconnect(self):
-    if self.connection != None:
-	self.connection.close()
-    self.connection = None
-
-  def execute(self, str):
-    if self.verbose > 0:
-	print str
-
-    self.cursor = self.connection.cursor()
-    self.cursor.execute(str)
-
-  def fetchall(self):
-    return self.cursor.fetchall()
-
-##########
-
-class OHTerms:
-  
-  def __init__(self, handler):
-    self.mysqlhandler = handler
-    self.data = {}
-    self.fill()
-
-  def fill(self):
-    query = "select distinct term from incidences"
-    self.mysqlhandler.execute(query)
-    self.data = {}
-    results = self.mysqlhandler.fetchall()
-#   c=0
-    for row in results:
-#     c+=1
-#      self.data[row[0]] = 0
-      self.data[row[0].encode("utf8")] = 0
-#     if c == 300:
-#       print row[0].encode("utf8")
-#       print self.data
-#      if row[0].find("agyag") > -1:
-#        print row[0]
-
-  def isMember(self, s):
-    if s in self.data:
-      return True
-    return False
-
-##################
-
-class OHAnalyze:
-  def __init__(self):
-    self.db = MySQLHandler()
-    self.db.connect("localhost", "dbdicter", "dbdicter123", "dbdict")
-    self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
-
-    # FIXME test code!
-    self.ohterms = OHTerms(self.db)
-
-  def bye(self):
-    if self.db != None:
-      self.db.disconnect()
-
-  def output(self, s, where):
-    if where == 0: #dump to text
-      print s + ";"
-    elif where == 1:
-      query = s;
-      self.db.execute(query)
-    else:
-      pass
-
-  def tokenize(self, s):
-    """Split a string into tokens
-       Param: the target string
-       Returns: the result list
-    """
-    l = re.split(' |;|-', s)
-    result = []
-    for sub in l:
-#      if sub.isdigit():
-#        result.append("_NUMBER_")
-#      else:
-      result.append(sub)
-    return result
-
-  def indexate(self, lst, norm):
-    """Finds an ordered set of strings in a normalized string
-       Param: the list and the normalized string
-       Returns: a list of tuples
-       Could be done differently, but it tests DB consistency somewhat.
-    """
-    mindex = 0
-    result = []
-    for l in lst:
-      ind = norm.find(l, mindex)
-      if mindex == -1:
-        raise 'Problem: ' + norm
-      #convert numbers!
-  #    if l.isdigit():
-  #      t = "_NUMBER_", ind
-  #    else:
-      t = l, ind
-      result.append(t)
-      mindex = ind + 1;
-
-    return result
-
-  def getAllSplits(self, s, level):
-    result = []
-    ll = len(s)
-    for i in range(2, ll + 1): #ignore single-letter entries
-      if self.ohterms.isMember(s[:i]):
-#        print " "*2*level + "MEMBER: " + s[:i]
-        if i == ll:
-          locresult = []
-          locresult.append(i)
-          result.append(locresult)
-#          print " "*2*level + "APPEND: " + str(locresult)
-          continue
-
-        t = self.getAllSplits(s[i:], level+1)
-        if t:
-          for resit in t:
-            locresult = []
-            locresult.append(i)
-            locresult.extend(resit)
-            result.append(locresult)
-#            print " "*2*level + "APPEND: " + str(locresult)
-#        print " "*2*level + str(t)
-#    print " "*2*level + str(result)
-    if level == 0 and not result:
-      result.append([len(s)])
-    return result
-
-  def filterTags(self, lst):
-    killables = []
-    for anali in range(0, len(lst)):
-      for lex, tag in lst[anali]:
-        if tag not in self.splittags:
-          killables.append(anali)
-          break
-    for ind in reversed(killables):
-      del lst[ind]
-    return lst
-
-  def humorFilter(self, s, splitindices): #filter bad splits
-    killables = []
-    for i in range(0, len(splitindices)): #for all splits
-      if len(splitindices[i]) == 1:
-        continue
-
-      previndex = 0
-      good = True
-      for j in splitindices[i]: #for all entries in the split
-        curindex = previndex + j
-#        print "  " + str(previndex), str(curindex)
-        hl = self.humorize(s[previndex:curindex])
-        if not hl:
-          killables.append(i)
-          break
-        previndex = curindex
-
-        self.filterTags(hl)
-        if not hl:
-          good = False
-          break
-      if not good:
-        killables.append(i)
-
-    for kitem in reversed(killables):
-      del splitindices[kitem]
-
-    if not splitindices:
-      splitindices = [[len(s)]]
-    return splitindices
-
-
-  def humorize(self, s):
-    h = StemmingAnalysis(s.rstrip())
-#    print "'{0}'".format(s), ":", len(h.getAnas())
-    result = []
-    for x in h.getAnas():
-      locresult = []
-      tags = [y.tag for y in x.morphs]
-      forms = [y.lex for y in x.morphs]
-      if len(tags) != len(forms):
-        raise Exception(str(tags) + " vs " + str(forms))
-
-      if tags[-1] == 'NOM':
-        del forms[-1]
-        del tags[-1]
-
-      for i in range(0, len(tags)):
-        t = forms[i], tags[i]
-        locresult.append(t)
-
-      result.append(locresult)
-
-#    print result
-    return result
-#      self.checkTags(l)
-
-
-  def getHumorSplits(self, s):
-    h = StemmingAnalysis(s.rstrip())
-    print "'{0}'".format(s), ":", len(h.getAnas())
-    result = []
-    for x in h.getAnas():
-      locresult = []
-      tags = [y.tag for y in x.morphs]
-      forms = [y.lex for y in x.morphs]
-      if len(tags) != len(forms):
-        raise Exception(str(tags) + " vs " + str(forms))
-
-      if tags[-1] == 'NOM':
-        del forms[-1]
-        del tags[-1]
-
-      for i in range(0, len(tags)):
-        t = forms[i], tags[i]
-        locresult.append(t)
-
-      result.append(locresult)
-
-#    print result
-#      self.checkTags(l)
-    return result
-
-  def makeIncidences(self, where):
-    if where == 0:
-      self.output("use dbdict", where)
-    self.output("drop table incidences", where)
-    self.output("create table incidences(term varchar(100), dict_id int, " + \
-      "idx int)", where)
-    self.output("create index incidence_index on incidences(term)", where)
-
-    query = "select id, actual, norm from ohdict";
-    self.db.execute(query)
-
-    results = self.db.fetchall()
-
-    counter = 0
-    for row in results:
-      counter += 1
-      news = self.tokenize(row[1])
-      idval = int(row[0])
-      il = self.indexate(news, row[2])
-      for ne, nind in il:
-        self.output("insert into incidences(term, dict_id, idx) values('" + \
-          ne + "', " + str(idval) + ", " + str(nind) + ")", where)
-    if where == 1:
-      self.db.connection.commit()
-
-  def printSplits(self, s, splits):
-    print s
-    for sp in splits:
-      previndex = 0
-      prl = []
-      for v in sp:
-        curindex = previndex + v
-        prl.append(s[previndex:curindex])
-        previndex = v
-      print "\t" + "+".join(prl)
-
-  def try2(self, where):
-
-    query = "select id, actual, norm from ohdict where id > 1000 limit 100";
-    self.db.execute(query)
-
-    results = self.db.fetchall()
-
-    counter = 0
-    for row in results:
-      counter += 1
-      news = self.tokenize(row[1])
-      idval = int(row[0])
-      il = self.indexate(news, row[2])
-      for ne, nind in il:
-#        self.getHumorSplits(ne)
-        termsplit = self.getAllSplits(ne, 0)
-#        print ne
-#        print termsplit
-        split = self.humorFilter(ne, termsplit)
-#        print split
-        self.printSplits(ne, split)
-
-
-
-oh = OHAnalyze()
-#oh.makeIncidences(2)
-oh.try2(2)
-#oh.getAllSplits("trolibusz", 0)
-
-#print oh.getAllSplits("agyagtalajhumorista", 0)
-#ll=oh.getAllSplits("agyagtalajhumorista", 0)
-#oh.humorFilter("agyagtalajhumorista", ll)
-#print oh.getAllSplits("virslitad", 0)
-#print "---"
-ll = oh.getAllSplits("agyagos", 0)
-#print ll
-oh.humorFilter("agyagos", ll)
-oh.bye()
-
-###
-# NOTE: Number conversions!!!
-###
-print "REMINDER: have a list of banned morphemes, i.e. letters etc., test for these in getAllSplits"