[Hejes-devel] [818] version control just for me
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Thu Jun 13 12:16:17 CEST 2013
Revision: 818
Author: hussami
Date: 2013-06-13 12:16:17 +0200 (Thu, 13 Jun 2013)
Log Message:
-----------
version control just for me
Modified Paths:
--------------
trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py
Modified: trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py 2013-06-10 18:47:36 UTC (rev 817)
+++ trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py 2013-06-13 10:16:17 UTC (rev 818)
@@ -1,7 +1,12 @@
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# coding: utf-8
+import sys
import MySQLdb
import re
+from egybekulon2_humor import StemmingAnalysis
+from egybekulon2_humor import HumorAna
class MySQLHandler:
def __init__(self, verbose = 0):
@@ -32,16 +37,52 @@
def fetchall(self):
return self.cursor.fetchall()
+##########
+class OHTerms:
+
+ def __init__(self, handler):
+ self.mysqlhandler = handler
+ self.data = {}
+ self.fill()
+
+ def fill(self):
+ query = "select distinct term from incidences"
+ self.mysqlhandler.execute(query)
+ self.data = {}
+ results = self.mysqlhandler.fetchall()
+ for row in results:
+ self.data[row[0]] = 0
+
+ def isMember(self, s):
+ if s in self.data:
+ return True
+ return False
+
+##################
+
class OHAnalyze:
def __init__(self):
self.db = MySQLHandler()
self.db.connect("localhost", "dbdicter", "dbdicter123", "dbdict")
+ self.splittags = {'MN': 0, 'FN': 1, 'IK': 2, 'IGE': 3}
+ # FIXME test code!
+ self.ohterms = OHTerms(self.db)
+
def bye(self):
if self.db != None:
self.db.disconnect()
+ def output(self, s, where):
+ if where == 0: #dump to text
+ print s + ";"
+ elif where == 1:
+ query = s;
+ self.db.execute(query)
+ else:
+ pass
+
def tokenize(self, s):
"""Split a string into tokens
Param: the target string
@@ -78,26 +119,89 @@
return result
- def try1(self):
- print "use dbdict;"
- print "drop table incidences;"
- print "create table incidences(term varchar(100), dict_id int, " + \
- "idx int);"
- query = "select id, actual, norm from ohdict where id < 20";
+ def getAllSplits(self, s, level):
+ result = []
+ ll = len(s)
+ for i in range(2, ll + 1): #ignore single-letter entries
+ if self.ohterms.isMember(s[:i]):
+# print " "*2*level + "MEMBER: " + s[:i]
+ if i == ll:
+ locresult = []
+ locresult.append(i)
+ result.append(locresult)
+# print " "*2*level + "APPEND: " + str(locresult)
+ continue
+
+ t = self.getAllSplits(s[i:], level+1)
+ if t:
+ for resit in t:
+ locresult = []
+ locresult.append(i)
+ locresult.extend(resit)
+ result.append(locresult)
+# print " "*2*level + "APPEND: " + str(locresult)
+# print " "*2*level + str(t)
+# print " "*2*level + str(result)
+ return result
+
+ def getHumorSplits(self, s):
+ h = StemmingAnalysis(s.rstrip())
+ print "'{0}'".format(s), ":", len(h.getAnas())
+ result = []
+ for x in h.getAnas():
+ locresult = []
+ tags = [y.tag for y in x.morphs]
+ forms = [y.lex for y in x.morphs]
+ if len(tags) != len(forms):
+ raise Exception(str(tags) + " vs " + str(forms))
+
+ if tags[-1] == 'NOM':
+ del forms[-1]
+ del tags[-1]
+
+ for i in range(0, len(tags)):
+ t = forms[i], tags[i]
+ locresult.append(t)
+
+ result.append(locresult)
+
+ print result
+# self.checkTags(l)
+
+ def try1(self, where):
+ if where == 0:
+ self.output("use dbdict", where)
+ self.output("drop table incidences", where)
+ self.output("create table incidences(term varchar(100), dict_id int, " + \
+ "idx int)", where)
+ self.output("create index incidence_index on incidences(term)", where)
+
+ query = "select id, actual, norm from ohdict where id > 1000 limit 100";
self.db.execute(query)
+
results = self.db.fetchall()
+ counter = 0
for row in results:
+ counter += 1
news = self.tokenize(row[1])
idval = int(row[0])
il = self.indexate(news, row[2])
for ne, nind in il:
- print "insert into incidences(term, dict_id, index) values('" + \
- ne + "', " + str(idval) + ", " + str(nind) + ");"
+ self.output("insert into incidences(term, dict_id, idx) values('" + \
+ ne + "', " + str(idval) + ", " + str(nind) + ")", where)
+ self.getAllSplits(ne)
+ if not where:
+ self.db.connection.commit()
+
+
oh = OHAnalyze()
-oh.try1()
+#oh.try1(2)
+#oh.getAllSplits("trolibusz", 0)
+print oh.getAllSplits("agyagtalajhumorista", 0)
+print oh.getAllSplits("virslitad", 0)
oh.bye()
###
More information about the Hejes-devel mailing list