[Hejes-devel] [819] getting there.
hejes-devel at nytud.hu
hejes-devel at nytud.hu
Fri Jun 14 14:26:57 CEST 2013
Revision: 819
Author: hussami
Date: 2013-06-14 14:26:57 +0200 (Fri, 14 Jun 2013)
Log Message:
-----------
Getting there. We now have splits for each term in the actual field; we just need to take their Cartesian product.
Modified Paths:
--------------
trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py
Modified: trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py
===================================================================
--- trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py 2013-06-13 10:16:17 UTC (rev 818)
+++ trunk/web2py/applications/helyesiras_webdev/modules/ohanalyze.py 2013-06-14 12:26:57 UTC (rev 819)
@@ -1,6 +1,6 @@
#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# coding: utf-8
+"""# -*- coding: utf-8 -*- """
+# coding: utf8
import sys
import MySQLdb
@@ -19,8 +19,9 @@
self.disconnect()
self.cursor = None
- def connect(self, server, user, pwd, db):
- self.connection = MySQLdb.connect(server, user, pwd, db)
+ def connect(self, server, user, pwd, dbs):
+ self.connection = MySQLdb.connect(host=server, user=user, passwd=pwd, \
+ db=dbs, charset='utf8')
def disconnect(self):
if self.connection != None:
@@ -51,8 +52,16 @@
self.mysqlhandler.execute(query)
self.data = {}
results = self.mysqlhandler.fetchall()
+# c=0
for row in results:
- self.data[row[0]] = 0
+# c+=1
+# self.data[row[0]] = 0
+ self.data[row[0].encode("utf8")] = 0
+# if c == 300:
+# print row[0].encode("utf8")
+# print self.data
+# if row[0].find("agyag") > -1:
+# print row[0]
def isMember(self, s):
if s in self.data:
@@ -142,8 +151,79 @@
# print " "*2*level + "APPEND: " + str(locresult)
# print " "*2*level + str(t)
# print " "*2*level + str(result)
+ if level == 0 and not result:
+ result.append([len(s)])
return result
+ def filterTags(self, lst):
+ killables = []
+ for anali in range(0, len(lst)):
+ for lex, tag in lst[anali]:
+ if tag not in self.splittags:
+ killables.append(anali)
+ break
+ for ind in reversed(killables):
+ del lst[ind]
+ return lst
+
+ def humorFilter(self, s, splitindices): #filter bad splits
+ killables = []
+ for i in range(0, len(splitindices)): #for all splits
+ if len(splitindices[i]) == 1:
+ continue
+
+ previndex = 0
+ good = True
+ for j in splitindices[i]: #for all entries in the split
+ curindex = previndex + j
+# print " " + str(previndex), str(curindex)
+ hl = self.humorize(s[previndex:curindex])
+ if not hl:
+ killables.append(i)
+ break
+ previndex = curindex
+
+ self.filterTags(hl)
+ if not hl:
+ good = False
+ break
+ if not good:
+ killables.append(i)
+
+ for kitem in reversed(killables):
+ del splitindices[kitem]
+
+ if not splitindices:
+ splitindices = [[len(s)]]
+ return splitindices
+
+
+ def humorize(self, s):
+ h = StemmingAnalysis(s.rstrip())
+# print "'{0}'".format(s), ":", len(h.getAnas())
+ result = []
+ for x in h.getAnas():
+ locresult = []
+ tags = [y.tag for y in x.morphs]
+ forms = [y.lex for y in x.morphs]
+ if len(tags) != len(forms):
+ raise Exception(str(tags) + " vs " + str(forms))
+
+ if tags[-1] == 'NOM':
+ del forms[-1]
+ del tags[-1]
+
+ for i in range(0, len(tags)):
+ t = forms[i], tags[i]
+ locresult.append(t)
+
+ result.append(locresult)
+
+# print result
+ return result
+# self.checkTags(l)
+
+
def getHumorSplits(self, s):
h = StemmingAnalysis(s.rstrip())
print "'{0}'".format(s), ":", len(h.getAnas())
@@ -165,10 +245,11 @@
result.append(locresult)
- print result
+# print result
# self.checkTags(l)
+ return result
- def try1(self, where):
+ def makeIncidences(self, where):
if where == 0:
self.output("use dbdict", where)
self.output("drop table incidences", where)
@@ -176,7 +257,7 @@
"idx int)", where)
self.output("create index incidence_index on incidences(term)", where)
- query = "select id, actual, norm from ohdict where id > 1000 limit 100";
+ query = "select id, actual, norm from ohdict";
self.db.execute(query)
results = self.db.fetchall()
@@ -190,20 +271,60 @@
for ne, nind in il:
self.output("insert into incidences(term, dict_id, idx) values('" + \
ne + "', " + str(idval) + ", " + str(nind) + ")", where)
+ if where == 1:
+ self.db.connection.commit()
- self.getAllSplits(ne)
+ def printSplits(self, s, splits):
+ print s
+ for sp in splits:
+ previndex = 0
+ prl = []
+ for v in sp:
+ curindex = previndex + v
+ prl.append(s[previndex:curindex])
+ previndex = v
+ print "\t" + "+".join(prl)
- if not where:
- self.db.connection.commit()
+ def try2(self, where):
+ query = "select id, actual, norm from ohdict where id > 1000 limit 100";
+ self.db.execute(query)
+ results = self.db.fetchall()
+
+ counter = 0
+ for row in results:
+ counter += 1
+ news = self.tokenize(row[1])
+ idval = int(row[0])
+ il = self.indexate(news, row[2])
+ for ne, nind in il:
+# self.getHumorSplits(ne)
+ termsplit = self.getAllSplits(ne, 0)
+# print ne
+# print termsplit
+ split = self.humorFilter(ne, termsplit)
+# print split
+ self.printSplits(ne, split)
+
+
+
oh = OHAnalyze()
-#oh.try1(2)
+#oh.makeIncidences(2)
+oh.try2(2)
#oh.getAllSplits("trolibusz", 0)
-print oh.getAllSplits("agyagtalajhumorista", 0)
-print oh.getAllSplits("virslitad", 0)
+
+#print oh.getAllSplits("agyagtalajhumorista", 0)
+#ll=oh.getAllSplits("agyagtalajhumorista", 0)
+#oh.humorFilter("agyagtalajhumorista", ll)
+#print oh.getAllSplits("virslitad", 0)
+#print "---"
+ll = oh.getAllSplits("agyagos", 0)
+#print ll
+oh.humorFilter("agyagos", ll)
oh.bye()
###
# NOTE: Number conversions!!!
###
+print "REMINDER: have a list of banned morphemes, i.e. letters etc., test for these in getAllSplits"
More information about the Hejes-devel mailing list