Initial commit
This commit is contained in:
0
src/ttl/__init__.py
Normal file
0
src/ttl/__init__.py
Normal file
62
src/ttl/ttlparser.py
Normal file
62
src/ttl/ttlparser.py
Normal file
@ -0,0 +1,62 @@
|
||||
'''
|
||||
Created on May 22, 2016
|
||||
|
||||
@author: tibi
|
||||
'''
|
||||
|
||||
from xml.dom import minidom;
|
||||
from xml.parsers.expat import ExpatError
|
||||
from model.Word import Word
|
||||
|
||||
def parseText(xmlText):
    """Parse TTL XML output into a flat word list and a per-chunk index.

    The document is searched for a ``segs`` element; its ``seg`` elements
    are treated as paragraphs, their ``s`` elements as sentences and the
    ``w`` elements inside each sentence as words. For every word the text
    content and the ``lemma``, ``ana`` and ``chunk`` attributes are read.
    The ``chunk`` attribute is split on ``","`` and one ``Word`` is created
    per resulting label, so a word that belongs to several chunks appears
    several times in the result.

    Args:
        xmlText: The XML document, in any form accepted by
            ``minidom.parseString``.

    Returns:
        A tuple ``(words, chunks)``:

        * ``words`` -- list of ``Word`` objects in document order, one per
          (word, chunk label) pair.
        * ``chunks`` -- dict mapping ``(sentence_index, chunk_label)`` to
          the list of ``Word`` objects with that label in that sentence.

        Sentence indices start at 1 and keep counting across paragraphs;
        word indices start at 1 and restart in every sentence.

    Raises:
        SystemExit: With code ``-1`` when ``xmlText`` is not well-formed
            XML (the offending text and the parser error are printed
            first).
        IndexError: When the document contains no ``segs`` element.
    """
    words = []
    chunks = {}

    try:
        dom = minidom.parseString(xmlText)
    except ExpatError as e:
        print("Error in text:", xmlText)
        print(e)
        # Same exception and exit code as the ``exit(-1)`` helper, but
        # without depending on the ``site`` module having injected it.
        raise SystemExit(-1)

    # Only the first "segs" element is processed.
    root = dom.getElementsByTagName("segs")[0]

    sentence_i = 0
    for paragraph in root.getElementsByTagName("seg"):
        for sentence in paragraph.getElementsByTagName("s"):
            sentence_i += 1

            word_nodes = sentence.getElementsByTagName("w")
            for word_i, word in enumerate(word_nodes, start=1):
                # An empty <w/> has no child node (firstChild is None);
                # fall back to an empty string instead of crashing.
                wordText = getattr(word.firstChild, "data", "")
                lemma = word.getAttribute("lemma")
                ana = word.getAttribute("ana")
                chunk = word.getAttribute("chunk")

                # One Word per chunk label the word is tagged with.
                for c in chunk.split(","):
                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
                    words.append(w)
                    chunks.setdefault((sentence_i, c), []).append(w)

    return (words, chunks)
|
34
src/ttl/ttlservice.py
Normal file
34
src/ttl/ttlservice.py
Normal file
@ -0,0 +1,34 @@
|
||||
# coding: utf-8
|
||||
import zeep
|
||||
|
||||
def executeTtl(text):
    """Send *text* through the RACAI TTL web service and return XML.

    Steps, in order:

    1. Replace three accented characters in ``text`` with plain ASCII
       letters.
    2. Build a ``zeep`` client from the service WSDL, call ``UTF8toSGML``
       on the text and pass its output to ``XCES``.
    3. Apply a fixed list of string replacements to the service result.
    4. Wrap the result in an XML declaration and a ``<segs>`` root element.

    Args:
        text: The input text to analyse.

    Returns:
        The cleaned service result wrapped as
        ``<?xml ... ?><segs>...</segs>``.
    """
    # Characters replaced with plain letters before calling the service.
    pre_replacements = (
        (u'ĭ', 'i'),
        (u'ŭ', 'u'),
        (u'à', 'a'),
    )
    for accented, plain in pre_replacements:
        text = text.replace(accented, plain)

    client = zeep.Client("http://ws.racai.ro/ttlws.wsdl")
    service = client.service
    # presumably "ro" is the language code and "id" a document/segment
    # identifier -- TODO confirm against the service documentation.
    result = service.XCES("ro", "id", service.UTF8toSGML(text))

    # Cleanup table, applied in this exact order.
    # NOTE(review): as written, several pairs map a character to itself and
    # therefore change nothing; the search strings may originally have been
    # SGML entity names -- confirm against the service output.
    post_replacements = (
        ('’', '`'),
        ('ă', u'ă'),
        ('à', u'à'),
        ('â', u'â'),
        ('î', u'î'),
        ('ş', u'ș'),
        ('ţ', u'ț'),
        ('ŭ', u'u'),
        ('Ă', u'Ă'),
        ('À', u'À'),
        ('Â', u'Â'),
        ('Î', u'Î'),
        ('Ş', u'Ș'),
        ('Ţ', u'Ț'),
        ('Ŭ', u'U'),
    )
    for old, new in post_replacements:
        result = result.replace(old, new)

    # Wrap the fragment in a single root element with an XML declaration.
    return "<?xml version=\"1.0\" encoding=\"utf-8\" ?><segs>" + result + "</segs>"
|
Reference in New Issue
Block a user