[Hejes-devel] [824] osiris analysis script check-in and move.
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Wed Jun 19 11:14:28 CEST 2013
Revision: 824
Author: hussami
Date: 2013-06-19 11:14:27 +0200 (Wed, 19 Jun 2013)
Log Message:
-----------
osiris analysis script check-in and move. This version works OK on the Zsofi problem set, but needs further specs.
Removed Paths:
-------------
trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py
Deleted: trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py 2013-06-18 13:49:09 UTC (rev 823)
+++ trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py 2013-06-19 09:14:27 UTC (rev 824)
@@ -1,330 +0,0 @@
-#!/usr/bin/env python
-"""# -*- coding: utf-8 -*- """
-# coding: utf8
-
-import sys
-import MySQLdb
-import re
-from egybekulon2_humor import StemmingAnalysis
-from egybekulon2_humor import HumorAna
-
-class MySQLHandler:
- def __init__(self, verbose = 0):
- self.connection = None
- self.verbose = verbose
- self.cursor = None
- self.clear()
-
- def clear(self):
- self.disconnect()
- self.cursor = None
-
- def connect(self, server, user, pwd, dbs):
- self.connection = MySQLdb.connect(host=server, user=user, passwd=pwd, \
- db=dbs, charset='utf8')
-
- def disconnect(self):
- if self.connection != None:
- self.connection.close()
- self.connection = None
-
- def execute(self, str):
- if self.verbose > 0:
- print str
-
- self.cursor = self.connection.cursor()
- self.cursor.execute(str)
-
- def fetchall(self):
- return self.cursor.fetchall()
-
-##########
-
-class OHTerms:
-
- def __init__(self, handler):
- self.mysqlhandler = handler
- self.data = {}
- self.fill()
-
- def fill(self):
- query = "select distinct term from incidences"
- self.mysqlhandler.execute(query)
- self.data = {}
- results = self.mysqlhandler.fetchall()
-# c=0
- for row in results:
-# c+=1
-# self.data[row[0]] = 0
- self.data[row[0].encode("utf8")] = 0
-# if c == 300:
-# print row[0].encode("utf8")
-# print self.data
-# if row[0].find("agyag") > -1:
-# print row[0]
-
- def isMember(self, s):
- if s in self.data:
- return True
- return False
-
-##################
-
-class OHAnalyze:
- def __init__(self):
- self.db = MySQLHandler()
- self.db.connect("localhost", "dbdicter", "dbdicter123", "dbdict")
- self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
-
- # FIXME test code!
- self.ohterms = OHTerms(self.db)
-
- def bye(self):
- if self.db != None:
- self.db.disconnect()
-
- def output(self, s, where):
- if where == 0: #dump to text
- print s + ";"
- elif where == 1:
- query = s;
- self.db.execute(query)
- else:
- pass
-
- def tokenize(self, s):
- """Split a string into tokens
- Param: the target string
- Returns: the result list
- """
- l = re.split(' |;|-', s)
- result = []
- for sub in l:
-# if sub.isdigit():
-# result.append("_NUMBER_")
-# else:
- result.append(sub)
- return result
-
- def indexate(self, lst, norm):
- """Finds an ordered set of strings in a normalized string
- Param: the list and the normalized string
- Returns: a list of tuples
- Could be done differently, but it tests DB consistency somewhat.
- """
- mindex = 0
- result = []
- for l in lst:
- ind = norm.find(l, mindex)
- if mindex == -1:
- raise 'Problem: ' + norm
- #convert numbers!
- # if l.isdigit():
- # t = "_NUMBER_", ind
- # else:
- t = l, ind
- result.append(t)
- mindex = ind + 1;
-
- return result
-
- def getAllSplits(self, s, level):
- result = []
- ll = len(s)
- for i in range(2, ll + 1): #ignore single-letter entries
- if self.ohterms.isMember(s[:i]):
-# print " "*2*level + "MEMBER: " + s[:i]
- if i == ll:
- locresult = []
- locresult.append(i)
- result.append(locresult)
-# print " "*2*level + "APPEND: " + str(locresult)
- continue
-
- t = self.getAllSplits(s[i:], level+1)
- if t:
- for resit in t:
- locresult = []
- locresult.append(i)
- locresult.extend(resit)
- result.append(locresult)
-# print " "*2*level + "APPEND: " + str(locresult)
-# print " "*2*level + str(t)
-# print " "*2*level + str(result)
- if level == 0 and not result:
- result.append([len(s)])
- return result
-
- def filterTags(self, lst):
- killables = []
- for anali in range(0, len(lst)):
- for lex, tag in lst[anali]:
- if tag not in self.splittags:
- killables.append(anali)
- break
- for ind in reversed(killables):
- del lst[ind]
- return lst
-
- def humorFilter(self, s, splitindices): #filter bad splits
- killables = []
- for i in range(0, len(splitindices)): #for all splits
- if len(splitindices[i]) == 1:
- continue
-
- previndex = 0
- good = True
- for j in splitindices[i]: #for all entries in the split
- curindex = previndex + j
-# print " " + str(previndex), str(curindex)
- hl = self.humorize(s[previndex:curindex])
- if not hl:
- killables.append(i)
- break
- previndex = curindex
-
- self.filterTags(hl)
- if not hl:
- good = False
- break
- if not good:
- killables.append(i)
-
- for kitem in reversed(killables):
- del splitindices[kitem]
-
- if not splitindices:
- splitindices = [[len(s)]]
- return splitindices
-
-
- def humorize(self, s):
- h = StemmingAnalysis(s.rstrip())
-# print "'{0}'".format(s), ":", len(h.getAnas())
- result = []
- for x in h.getAnas():
- locresult = []
- tags = [y.tag for y in x.morphs]
- forms = [y.lex for y in x.morphs]
- if len(tags) != len(forms):
- raise Exception(str(tags) + " vs " + str(forms))
-
- if tags[-1] == 'NOM':
- del forms[-1]
- del tags[-1]
-
- for i in range(0, len(tags)):
- t = forms[i], tags[i]
- locresult.append(t)
-
- result.append(locresult)
-
-# print result
- return result
-# self.checkTags(l)
-
-
- def getHumorSplits(self, s):
- h = StemmingAnalysis(s.rstrip())
- print "'{0}'".format(s), ":", len(h.getAnas())
- result = []
- for x in h.getAnas():
- locresult = []
- tags = [y.tag for y in x.morphs]
- forms = [y.lex for y in x.morphs]
- if len(tags) != len(forms):
- raise Exception(str(tags) + " vs " + str(forms))
-
- if tags[-1] == 'NOM':
- del forms[-1]
- del tags[-1]
-
- for i in range(0, len(tags)):
- t = forms[i], tags[i]
- locresult.append(t)
-
- result.append(locresult)
-
-# print result
-# self.checkTags(l)
- return result
-
- def makeIncidences(self, where):
- if where == 0:
- self.output("use dbdict", where)
- self.output("drop table incidences", where)
- self.output("create table incidences(term varchar(100), dict_id int, " + \
- "idx int)", where)
- self.output("create index incidence_index on incidences(term)", where)
-
- query = "select id, actual, norm from ohdict";
- self.db.execute(query)
-
- results = self.db.fetchall()
-
- counter = 0
- for row in results:
- counter += 1
- news = self.tokenize(row[1])
- idval = int(row[0])
- il = self.indexate(news, row[2])
- for ne, nind in il:
- self.output("insert into incidences(term, dict_id, idx) values('" + \
- ne + "', " + str(idval) + ", " + str(nind) + ")", where)
- if where == 1:
- self.db.connection.commit()
-
- def printSplits(self, s, splits):
- print s
- for sp in splits:
- previndex = 0
- prl = []
- for v in sp:
- curindex = previndex + v
- prl.append(s[previndex:curindex])
- previndex = v
- print "\t" + "+".join(prl)
-
- def try2(self, where):
-
- query = "select id, actual, norm from ohdict where id > 1000 limit 100";
- self.db.execute(query)
-
- results = self.db.fetchall()
-
- counter = 0
- for row in results:
- counter += 1
- news = self.tokenize(row[1])
- idval = int(row[0])
- il = self.indexate(news, row[2])
- for ne, nind in il:
-# self.getHumorSplits(ne)
- termsplit = self.getAllSplits(ne, 0)
-# print ne
-# print termsplit
- split = self.humorFilter(ne, termsplit)
-# print split
- self.printSplits(ne, split)
-
-
-
-oh = OHAnalyze()
-#oh.makeIncidences(2)
-oh.try2(2)
-#oh.getAllSplits("trolibusz", 0)
-
-#print oh.getAllSplits("agyagtalajhumorista", 0)
-#ll=oh.getAllSplits("agyagtalajhumorista", 0)
-#oh.humorFilter("agyagtalajhumorista", ll)
-#print oh.getAllSplits("virslitad", 0)
-#print "---"
-ll = oh.getAllSplits("agyagos", 0)
-#print ll
-oh.humorFilter("agyagos", ll)
-oh.bye()
-
-###
-# NOTE: Number conversions!!!
-###
-print "REMINDER: have a list of banned morphemes, i.e. letters etc., test for these in getAllSplits"
More information about the Hejes-devel
mailing list