Initial commit

This commit is contained in:
2017-05-23 13:57:53 +03:00
commit 6badfbd103
38 changed files with 1286 additions and 0 deletions

0
src/ttl/__init__.py Normal file
View File

62
src/ttl/ttlparser.py Normal file
View File

@ -0,0 +1,62 @@
'''
Created on May 22, 2016
@author: tibi
'''
from xml.dom import minidom;
from xml.parsers.expat import ExpatError
from model.Word import Word
def parseText(xmlText):
    '''
    Parse an XML document of tagged words into Word objects grouped by chunk.

    The document is searched for a "segs" element; inside the first one,
    "seg" elements are treated as paragraphs, "s" elements as sentences and
    "w" elements as words carrying "lemma", "ana" and "chunk" attributes.

    xmlText -- the XML document as a string.

    Returns a tuple (words, chunks):
      words  -- list of Word objects in document order. A word whose
                "chunk" attribute lists several comma-separated chunks
                yields one Word per chunk.
      chunks -- dict mapping (sentence index, chunk) to the list of Word
                objects belonging to that chunk. Sentence indexes start at
                1 and keep counting across paragraphs; word indexes start
                at 1 and restart in every sentence.

    If the text is not well-formed XML, the text and the parser error are
    printed and SystemExit(-1) is raised. A well-formed document without a
    "segs" element yields ([], {}).
    '''
    try:
        dom = minidom.parseString(xmlText)
    except ExpatError as e:
        print("Error in text:", xmlText)
        print(e)
        # Raise SystemExit directly instead of calling exit(): the exit()
        # helper is installed by the site module and may be absent.
        raise SystemExit(-1)

    words = []
    chunks = {}

    # get the root "segs" element; without one there is nothing to parse
    alltext = dom.getElementsByTagName("segs")
    if not alltext:
        return (words, chunks)

    sentence_i = 0
    # iterate paragraphs
    for paragraph in alltext[0].getElementsByTagName("seg"):
        # iterate sentences
        for sentence in paragraph.getElementsByTagName("s"):
            sentence_i += 1
            # iterate words, numbering them from 1 within the sentence
            for word_i, word in enumerate(sentence.getElementsByTagName("w"), 1):
                # An empty <w/> element has no child node; use an empty
                # string rather than failing on None.
                textNode = word.firstChild
                wordText = textNode.data if textNode is not None else ""
                lemma = word.getAttribute("lemma")
                ana = word.getAttribute("ana")
                chunk = word.getAttribute("chunk")
                # One Word per chunk, so a word shared between chunks is
                # listed once for each of them.
                for c in chunk.split(","):
                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
                    words.append(w)
                    chunks.setdefault((sentence_i, c), []).append(w)
    return (words, chunks)

34
src/ttl/ttlservice.py Normal file
View File

@ -0,0 +1,34 @@
# coding: utf-8
import zeep
def executeTtl(text):
    '''
    Send text through the RACAI TTL web service and return the result as XML.

    The text is first stripped of a few accented characters, converted with
    the service's UTF8toSGML operation and processed with its XCES
    operation (called with "ro" and "id"). The returned fragment is then
    cleaned up and wrapped in an XML declaration and a "segs" root element.

    text -- the input text.

    Returns the wrapped XML document as a string.
    '''
    # Preprocess the text: replace accented characters with plain letters
    inputSubstitutions = (
        (u'ĭ', 'i'),
        (u'ŭ', 'u'),
        (u'à', 'a'),
    )
    for old, new in inputSubstitutions:
        text = text.replace(old, new)

    client = zeep.Client("http://ws.racai.ro/ttlws.wsdl")
    service = client.service
    result = service.XCES("ro", "id", service.UTF8toSGML(text))

    # Cleanup result - generate valid xml. Applied in the listed order.
    # NOTE(review): several pairs below map a character to itself; the
    # search strings were presumably SGML entities originally - confirm
    # against the repository copy of this file.
    outputSubstitutions = (
        ('’', '`'),
        ('ă', u'ă'),
        ('à', u'à'),
        ('â', u'â'),
        ('î', u'î'),
        ('ş', u'ș'),
        ('ţ', u'ț'),
        ('ŭ', u'u'),
        ('Ă', u'Ă'),
        ('À', u'À'),
        ('Â', u'Â'),
        ('Î', u'Î'),
        ('Ş', u'Ș'),
        ('Ţ', u'Ț'),
        ('Ŭ', u'U'),
    )
    for old, new in outputSubstitutions:
        result = result.replace(old, new)

    # Wrap the fragment so that it forms a single-rooted XML document
    return '<?xml version="1.0" encoding="utf-8" ?><segs>' + result + '</segs>'