Initial commit

This commit is contained in:
Tiberiu Chibici 2017-05-23 13:57:53 +03:00
commit 6badfbd103
38 changed files with 1286 additions and 0 deletions

107
.gitignore vendored Normal file
@@ -0,0 +1,107 @@
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
logs/*
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/

220
.vscode/launch.json vendored Normal file
@@ -0,0 +1,220 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Run without debugging",
"type": "python",
"request": "launch",
"stopOnEntry": false,
"pythonPath": "${config:python.pythonPath}",
"program": "${file}",
"cwd": "${workspaceRoot}",
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "Python",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"program": "${file}",
"cwd": "${workspaceRoot}",
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "PySpark",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"osx": {
"pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
},
"windows": {
"pythonPath": "${env:SPARK_HOME}/bin/spark-submit.cmd"
},
"linux": {
"pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
},
"program": "${file}",
"cwd": "${workspaceRoot}",
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "Python Module",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"module": "module.name",
"cwd": "${workspaceRoot}",
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "Integrated Terminal/Console",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"program": "${file}",
"cwd": "",
"console": "integratedTerminal",
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit"
]
},
{
"name": "External Terminal/Console",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"program": "${file}",
"cwd": "",
"console": "externalTerminal",
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit"
]
},
{
"name": "Django",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"program": "${workspaceRoot}/manage.py",
"cwd": "${workspaceRoot}",
"args": [
"runserver",
"--noreload"
],
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput",
"DjangoDebugging"
]
},
{
"name": "Flask",
"type": "python",
"request": "launch",
"stopOnEntry": false,
"pythonPath": "${config:python.pythonPath}",
"program": "fully qualified path fo 'flask' executable. Generally located along with python interpreter",
"cwd": "${workspaceRoot}",
"env": {
"FLASK_APP": "${workspaceRoot}/quickstart/app.py"
},
"args": [
"run",
"--no-debugger",
"--no-reload"
],
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "Flask (old)",
"type": "python",
"request": "launch",
"stopOnEntry": false,
"pythonPath": "${config:python.pythonPath}",
"program": "${workspaceRoot}/run.py",
"cwd": "${workspaceRoot}",
"args": [],
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "Pyramid",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"cwd": "${workspaceRoot}",
"env": {},
"envFile": "${workspaceRoot}/.env",
"args": [
"${workspaceRoot}/development.ini"
],
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput",
"Pyramid"
]
},
{
"name": "Watson",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"program": "${workspaceRoot}/console.py",
"cwd": "${workspaceRoot}",
"args": [
"dev",
"runserver",
"--noreload=True"
],
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "Attach (Remote Debug)",
"type": "python",
"request": "attach",
"localRoot": "${workspaceRoot}",
"remoteRoot": "${workspaceRoot}",
"port": 3000,
"secret": "my_secret",
"host": "localhost"
}
]
}

3
.vscode/settings.json vendored Normal file
@@ -0,0 +1,3 @@
// Place your settings in this file to overwrite default and user settings.
{
}

BIN
NLP.zip Normal file

Binary file not shown.

17
NLP/.project Normal file
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>NLP</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.python.pydev.PyDevBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.python.pydev.pythonNature</nature>
</natures>
</projectDescription>

8
NLP/.pydevproject Normal file
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?><pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/${PROJECT_DIR_NAME}/src</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 3.0</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python 3.5</pydev_property>
</pydev_project>

1
NLP/data/text1.txt Normal file
@@ -0,0 +1 @@
Maria are mere. Ea mai are șapte pere. Acestea sunt foarte delicioase.

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Maria" ana="Np" chunk="Np#1">Maria</w><w lemma="avea" ana="Vmip3s" chunk="Vp#1">are</w><w lemma="măr" ana="Ncfp-n" chunk="Np#2">mere</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="el" ana="Pp3fsr--------s">Ea</w><w lemma="mai" ana="Rp" chunk="Ap#1">mai</w><w lemma="avea" ana="Vmip3s" chunk="Vp#1">are</w><w lemma="șapte" ana="Mc-p-l" chunk="Np#1">șapte</w><w lemma="pară" ana="Ncfp-n" chunk="Np#1">pere</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.3"><w lemma="acesta" ana="Pd3fpr">Acestea</w><w lemma="fi" ana="Vmip3p" chunk="Vp#1">sunt</w><w lemma="foarte" ana="Rp" chunk="Ap#1,Vp#1">foarte</w><w lemma="delicios" ana="Afpfp-n" chunk="Ap#1">delicioase</w><c>.</c></s></seg>
</segs>

1
NLP/data/text2.txt Normal file
@@ -0,0 +1 @@
Sabeer Bhatia a ajuns la Aeroportul Internațional din Los Angeles la ora 18 în data de 23 septembrie 1998. Zborul său din Bangalore a durat 22 ore, și el era înfometat.

@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Sabeer" ana="Np" chunk="Np#1">Sabeer</w><w lemma="Bhatia" ana="Np" chunk="Np#1">Bhatia</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="ajunge" ana="Vmp--sm" chunk="Vp#1">ajuns</w><w lemma="la" ana="Spsa" chunk="Pp#1">la</w><w lemma="aeroport" ana="Ncmsry" chunk="Pp#1,Np#2">Aeroportul</w><w lemma="internațional" ana="Afpms-n" chunk="Pp#1,Np#2,Ap#1">Internațional</w><w lemma="din" ana="Spsa" chunk="Pp#2">din</w><w lemma="Los" ana="Np" chunk="Pp#2,Np#3">Los</w><w lemma="Angeles" ana="Np" chunk="Pp#2,Np#3">Angeles</w><w lemma="la" ana="Spsa" chunk="Pp#3">la</w><w lemma="oră" ana="Ncfsry" chunk="Pp#3,Np#4">ora</w><w lemma="18" ana="Mc" chunk="Pp#3,Np#4">18</w><w lemma="în" ana="Spsa" chunk="Pp#4">în</w><w lemma="dată" ana="Ncfsry" chunk="Pp#4,Np#5">data</w><w lemma="de" ana="Spsa" chunk="Pp#5">de</w><w lemma="23" ana="Mc" chunk="Pp#5,Np#6">23</w><w lemma="septembrie" ana="Ncms-n" chunk="Pp#5,Np#6">septembrie</w><w lemma="1998" ana="Mc" chunk="Pp#5,Np#6">1998</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="zbor" ana="Ncmsry" chunk="Np#1">Zborul</w><w lemma="său" ana="Ds3ms-s" chunk="Np#1">său</w><w lemma="din" ana="Spsa" chunk="Pp#1">din</w><w lemma="Bangalore" ana="Np" chunk="Pp#1,Np#2">Bangalore</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="dura" ana="Vmp--sm" chunk="Vp#1">durat</w><w lemma="22" ana="Mc" chunk="Np#3">22</w><w lemma="oră" ana="Ncfp-n" chunk="Np#3">ore</w><c>,</c><w lemma="și" ana="Crssp">și</w><w lemma="el" ana="Pp3msr--------s" chunk="Vp#2">el</w><w lemma="fi" ana="Vaii3s" chunk="Vp#2">era</w><w lemma="înfometa" ana="Vmp--sm" chunk="Vp#2">înfometat</w><c>.</c></s></seg>
</segs>

1
NLP/data/text3.txt Normal file
@@ -0,0 +1 @@
Sophia Loren spune că ea va fi întotdeauna mulțumitoare față de Bono. Actrița a dezvăluit că cântărețul trupei U2 a ajutat-o să se liniștească atunci când ea s-a speriat de o furtună în timp ce zburau cu avionul.

@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8" ?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Sophia" ana="Np" chunk="Np#1">Sophia</w><w lemma="Loren" ana="Np" chunk="Np#1">Loren</w><w lemma="spune" ana="Vmnp" chunk="Vp#1">spune</w><w lemma="că" ana="Csssp"></w><w lemma="el" ana="Pp3fsr--------s" chunk="Vp#2">ea</w><w lemma="vrea" ana="Va--3s" chunk="Vp#2">va</w><w lemma="fi" ana="Vmnp" chunk="Vp#2">fi</w><w lemma="întotdeauna" ana="Rgp" chunk="Vp#2,Ap#1">întotdeauna</w><w lemma="mulțumitor" ana="Afpf--n" chunk="Ap#1">mulțumitoare</w><w lemma="față_de" ana="Spca" chunk="Pp#1">față_de</w><w lemma="bonă" ana="Ncfsvy" chunk="Pp#1,Np#2">Bono</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="actriță" ana="Ncfsry" chunk="Np#1">Actrița</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="dezvălui" ana="Vmp--sm" chunk="Vp#1">dezvăluit</w><w lemma="că" ana="Csssp"></w><w lemma="cântăreț" ana="Ncmsry" chunk="Np#2">cântărețul</w><w lemma="trupă" ana="Ncfsoy" chunk="Np#2">trupei</w><w lemma="U2" ana="Np" chunk="Np#2">U2</w><w lemma="avea" ana="Va--3s" chunk="Vp#2">a</w><w lemma="ajuta" ana="Vmp--sm" chunk="Vp#2">ajutat</w><w lemma="el" ana="Pp3fsa--y-----w">-o</w><w lemma="să" ana="Qs" chunk="Vp#3"></w><w lemma="sine" ana="Px3--a--------w" chunk="Vp#3">se</w><w lemma="liniști" ana="Vmsp3" chunk="Vp#3">liniștească</w><w lemma="atunci_când" ana="Rw" chunk="Vp#3,Ap#1">atunci_când</w><w lemma="el" ana="Pp3fsr--------s">ea</w><w lemma="sine" ana="Px3--a--y-----w" chunk="Vp#4">s-</w><w lemma="avea" ana="Va--3s" chunk="Vp#4">a</w><w lemma="speria" ana="Vmp--sm" chunk="Vp#4">speriat</w><w lemma="de" ana="Spsa" chunk="Pp#1">de</w><w lemma="un" ana="Tifsr" chunk="Pp#1,Np#3">o</w><w lemma="furtună" ana="Ncfsrn" chunk="Pp#1,Np#3">furtună</w><w lemma="în_timp_ce" ana="Cscsp">în_timp_ce</w><w lemma="zbura" ana="Vmii3p" chunk="Vp#5">zburau</w><w lemma="cu" ana="Spsa" chunk="Pp#2">cu</w><w lemma="avion" ana="Ncmsry" chunk="Pp#2,Np#4">avionul</w><c>.</c></s></seg>
</segs>

121
NLP/src/anaphora.py Normal file
@@ -0,0 +1,121 @@
'''
Created on May 22, 2016

@author: tibi
'''

from model import Word


def getGender(word):
    # First and second person pronouns carry no useful gender information
    if word.isPronoun() and (word.pronounGetPerson() == '1' or word.pronounGetPerson() == '2'):
        return 'n'
    return word.getGender()


def genderMatch(word1, word2):
    g1 = getGender(word1)
    g2 = getGender(word2)
    if g1 == g2:
        return 2
    if g1 == 'n' or g2 == 'n':
        return 1
    return 0


def isPrepositional(chunk):
    for word in chunk:
        if word.isPreposition():
            return True
    return False


def countInText(noun, text):
    c = 0
    for word in text:
        if word.text == noun.text:
            c += 1
    return c


def anaphora(text, chunks):
    nounPhrases = []

    for word in text:
        if word.isNoun():
            print("[n]", word)
            nounPhrases.append((word, (word.sentenceIndex, word.chunk)))
        else:
            print(word)

        if word.isPronoun():
            candidates = []

            # Walk the recently seen noun phrases, most recent first
            for noun, chunkIndex in nounPhrases[:-30:-1]:

                # If gender and number match
                if genderMatch(word, noun) > 0 and word.getNumber() == noun.getNumber():
                    npInd = genderMatch(word, noun)

                    # definiteness
                    if not noun.nounIsDefinite():
                        npInd -= 1

                    # non-prepositional noun phrase
                    chunk = chunks[chunkIndex]
                    if isPrepositional(chunk):
                        npInd -= 1

                    # first in sentence
                    if noun.sentenceIndex == 1:
                        npInd += 1

                    # indicating verbs
                    # todo...

                    # lexical reiteration
                    c = countInText(noun, text)
                    if c == 2:
                        npInd += 1
                    if c > 2:
                        npInd += 2

                    # noun is representing term
                    # how?

                    # identical collocation pattern to the pronoun
                    # ???

                    # immediate reference, resolving 'it'
                    # applicable?

                    # referential distance
                    dist = word.sentenceIndex - noun.sentenceIndex
                    if dist == 0:
                        npInd += 2
                    elif dist == 1:
                        npInd += 1

                    candidates.append((noun, npInd))
                    print("...> Candidate: {0} npInd = {1}".format(noun, npInd))

            if len(candidates) > 0:
                # Pick the candidate with the highest indicator score
                # (separate loop variables so the outer 'word' is not clobbered)
                pickedWord, pickedInd = candidates[0]
                for cand, ind in candidates:
                    if ind > pickedInd:
                        pickedInd = ind
                        pickedWord = cand

                print(".>>> Picked: {0}".format(pickedWord))

55
NLP/src/fileparser.py Normal file
@@ -0,0 +1,55 @@
'''
Created on May 22, 2016

@author: tibi
'''

from xml.dom import minidom

from model.Word import Word


def parse(filename):
    words = []
    chunks = {}
    sentence_i = 0

    # get the root "segs" element
    dom = minidom.parse(filename)
    alltext = dom.getElementsByTagName("segs")

    # iterate paragraphs
    for paragraph in alltext[0].getElementsByTagName("seg"):

        # iterate sentences
        for sentence in paragraph.getElementsByTagName("s"):

            # increment sentence index
            sentence_i += 1
            word_i = 0

            # iterate words
            for word in sentence.getElementsByTagName("w"):

                # increment word index
                word_i += 1

                # obtain word info
                wordText = word.firstChild.data
                lemma = word.getAttribute("lemma")
                ana = word.getAttribute("ana")
                chunk = word.getAttribute("chunk")

                # create word
                #w = Word(wordText, lemma, ana, chunk, sentence_i, word_i)
                #words.append(w)
                for c in chunk.split(","):
                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
                    words.append(w)

                    if chunks.get((sentence_i, c)) is None:
                        chunks[(sentence_i, c)] = [w]
                    else:
                        chunks[(sentence_i, c)].append(w)

    return (words, chunks)

26
NLP/src/main.py Normal file
@@ -0,0 +1,26 @@
'''
Created on May 22, 2016

@author: tibi
'''

import fileparser
from anaphora import anaphora

if __name__ == '__main__':

    words, chunks = fileparser.parse("../data/text3_processed.xml")

    print("Words:")
    for word in words:
        print("[{0} {1}] {2}".format(word.sentenceIndex, word.wordIndex, word))
    print("")

    print("Chunks:")
    for key, value in chunks.items():
        print(key, ":")
        for word in value:
            print(" - ", word)
    print("")

    print("Anaphora resolution:")
    anaphora(words, chunks)

88
NLP/src/model/Word.py Normal file
@@ -0,0 +1,88 @@
'''
Created on May 22, 2016

@author: tibi
'''


class Word:
    text = ""
    lemma = ""
    ana = ""
    chunk = ""
    sentenceIndex = 0
    wordIndex = 0

    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
        self.text = text
        self.lemma = lemma
        self.ana = ana
        self.chunk = chunk
        self.sentenceIndex = sentenceIndex
        self.wordIndex = wordIndex

    def __str__(self):
        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)

    def isNoun(self):
        return self.ana[0] == "N"

    def nounIsCommon(self):
        return self.isNoun() and self.ana[1] == "c"

    def nounIsProper(self):
        return self.isNoun() and self.ana[1] == "p"

    def nounGetCase(self):
        if self.isNoun():
            return self.ana[4]
        return None

    # Is the noun definite (articulated)?
    def nounIsDefinite(self):
        if self.isNoun():
            if self.nounIsProper():
                return True
            if len(self.ana) > 5:
                return self.ana[5]
        return "n"

    def pronounGetPerson(self):
        if self.isPronoun():
            return self.ana[2]

    def getGender(self):
        if self.isNoun():
            if len(self.ana) >= 3:
                return self.ana[2]
            return 'n'
        if self.isPronoun():
            return self.ana[3]
        return None

    def getNumber(self):
        if self.isNoun():
            if self.nounIsProper():
                return 's'
            else:
                return self.ana[3]
        if self.isPronoun():
            return self.ana[4]
        return None

    def isPronoun(self):
        return self.ana[0] == "P"

    def isVerb(self):
        return self.ana[0] == "V"

    def isPreposition(self):
        return self.ana[0] == "S" and self.ana[1] == "p"


1
data/index.csv Normal file
@@ -0,0 +1 @@
books/Moara cu noroc - Ioan Slavici.epub;Moara cu noroc;1880;Ioan Slavici;1848;Transilvania

BIN
data/results.db Normal file

Binary file not shown.

BIN
data/texts.db Normal file

Binary file not shown.

BIN
data/texts.db.bak Normal file

Binary file not shown.

BIN
data/texts2.db Normal file

Binary file not shown.

20
src/logger.py Normal file
@@ -0,0 +1,20 @@
import time
import logging


def init_logger(level):
    # Log filename
    tm = time.strftime('%Y-%m-%d_%H-%M-%S')
    logFile = "logs/log_{0}.log".format(tm)

    # Set up file logger
    logging.basicConfig(filename=logFile,
                        level=logging.DEBUG,
                        format='%(asctime)s %(name)s %(levelname)s %(message)s',
                        datefmt='%m-%d %H:%M')

    # Set up console logger
    formatter = logging.Formatter('[%(name)s] %(levelname)s: %(message)s')
    console = logging.StreamHandler()
    console.setLevel(level)
    console.setFormatter(formatter)
    logging.getLogger().addHandler(console)

40
src/main.py Normal file
@@ -0,0 +1,40 @@
import logging
import time

# own
import logger
import storage.data
import storage.results
import textprocessor.letterfreq
import ttl.ttlparser
import ttl.ttlservice


def init():
    logger.init_logger(logging.WARNING)
    storage.data.initializeFragmentDatabase("data/texts.db")
    storage.results.initializeResultsDatabase("data/results.db", True)


def processTexts():
    count = storage.data.getTextCount()
    current = 0

    for item in storage.data.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # obtain ttl analysis
        # unfeasible - it takes 5-10 minutes for a single text
        # ttlResult = ttl.ttlservice.executeTtl(itemtext)
        # (words, chunks) = ttl.ttlparser.parseText(ttlResult)
        # storage.results.storeTtlAnalysis(itemid, words)

        # perform analysis
        letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext)
        storage.results.storeFrequencies(itemid, letterFreq)

    print("Finished!")


init()
processTexts()

32
src/model.py Normal file
@@ -0,0 +1,32 @@
# Defines a fragment author
class Author:

    def __init__(self, name = "", birthYear = "", location = "Romania"):
        self.name = name
        self.yearOfBirth = birthYear
        self.location = location

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def dump(self):
        return "[Author name={0} yearOfBirth={1} location={2}]".format(self.name, self.yearOfBirth, self.location)


# Defines a text fragment
class Fragment:

    def __init__(self, title = "", text = "", author = Author(), year = 1999):
        self.title = title
        self.text = text
        self.author = author
        self.year = year

    def __str__(self):
        return self.title

    def __repr__(self):
        return self.title

    def dump(self):
        return "[Fragment title={0} author={1} year={2} text={3}]".format(self.title, self.author.dump(), self.year, self.text)

87
src/model/Word.py Normal file
@@ -0,0 +1,87 @@
# Defines a processed word
class Word:
    text = ""
    lemma = ""
    ana = ""
    chunk = ""
    sentenceIndex = 0
    wordIndex = 0

    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
        self.text = text
        self.lemma = lemma
        self.ana = ana
        self.chunk = chunk
        self.sentenceIndex = sentenceIndex
        self.wordIndex = wordIndex

    def __str__(self):
        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)

    def __repr__(self):
        return str(self)

    def isNoun(self):
        return self.ana[0] == "N"

    def nounIsCommon(self):
        return self.isNoun() and self.ana[1] == "c"

    def nounIsProper(self):
        return self.isNoun() and self.ana[1] == "p"

    def nounGetCase(self):
        if self.isNoun():
            return self.ana[4]
        return None

    # Is the noun definite (articulated)?
    def nounIsDefinite(self):
        if self.isNoun():
            if self.nounIsProper():
                return True
            if len(self.ana) > 5:
                return self.ana[5]
        return "n"

    def pronounGetPerson(self):
        if self.isPronoun():
            return self.ana[2]

    def getGender(self):
        if self.isNoun():
            if len(self.ana) >= 3:
                return self.ana[2]
            return 'n'
        if self.isPronoun():
            return self.ana[3]
        return None

    def getNumber(self):
        if self.isNoun():
            if self.nounIsProper():
                return 's'
            else:
                return self.ana[3]
        if self.isPronoun():
            return self.ana[4]
        return None

    def isPronoun(self):
        return self.ana[0] == "P"

    def isVerb(self):
        return self.ana[0] == "V"

    def isPreposition(self):
        return self.ana[0] == "S" and self.ana[1] == "p"
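For orientation, here is a small, hypothetical usage sketch (not part of this commit) showing how a morphosyntactic "ana" tag from the processed samples above maps onto these accessors; the tag values are copied from the word "mere" in NLP/data/text1, while the sentence/word indices are assumed.

# Hypothetical sketch; tag values taken from the processed text1 sample, indices assumed.
from model.Word import Word

w = Word("mere", "măr", "Ncfp-n", "Np#2", 1, 3)
print(w.isNoun())          # True  -> 'N' = noun
print(w.nounIsCommon())    # True  -> 'c' = common noun
print(w.getGender())       # 'f'   -> feminine
print(w.getNumber())       # 'p'   -> plural
print(w.nounIsDefinite())  # 'n'   -> indefinite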

0
src/model/__init__.py Normal file

0
src/storage/__init__.py Normal file

80
src/storage/data.py Normal file
@@ -0,0 +1,80 @@
import logging
import os
import sqlite3

from model import *

log = logging.getLogger("storage")

DB_FRAGMENTS = ""

# Commands
# birth location - general area, not exact location (i.e. Transylvania)
# birth origin - rural or urban
# studies - masters, bachelors, high school, middle school, primary school
# occupation - comma separated if there are multiple
# studiesAbroad - foreign cities where author studied (comma separated)
COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
    name TEXT PRIMARY KEY,
    birthYear INTEGER,
    birthLocation TEXT,
    birthOrigin TEXT,
    studies TEXT,
    occupations TEXT,
    studiesAbroad TEXT
)"""

# genre - short story (nuvela), novel (roman), poem etc
# movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
# tags - other relevant information (i.e. psychological)
COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
    id INTEGER PRIMARY KEY,
    title TEXT,
    year INTEGER,
    author TEXT REFERENCES Authors(name),
    genre TEXT,
    movement TEXT,
    tags TEXT
)"""

# contains the actual text
COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
    id INTEGER REFERENCES Fragments(id),
    content TEXT
)"""


# Initialize databases
def initializeFragmentDatabase(dbFile):
    global DB_FRAGMENTS
    DB_FRAGMENTS = dbFile

    if not os.path.exists(dbFile):
        log.info("Text database %s not found. Will create database.", dbFile)

        con = sqlite3.connect(dbFile)
        c = con.cursor()
        c.execute(COMMAND_CREATE_AUTHORS)
        c.execute(COMMAND_CREATE_FRAGMENTS)
        c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT)
        con.commit()
        con.close()

        log.info("Database created!")


def getTextCount():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()

    c.execute("SELECT COUNT(*) FROM Fragments")
    item = c.fetchone()

    c.close()
    con.close()
    return item[0]


def getAllTexts():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()

    c.execute("SELECT id, content FROM FragmentsContent")
    items = c.fetchall()

    c.close()
    con.close()
    return items
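As a rough illustration of how the schema above is meant to be populated, here is a hypothetical sketch (not part of this commit); the author, year, and title values come from data/index.csv below, while the helper name and the fragment content are invented placeholders.

# Hypothetical sketch; insertSampleFragment and the placeholder content are invented.
import sqlite3

def insertSampleFragment(dbFile):
    con = sqlite3.connect(dbFile)
    c = con.cursor()
    c.execute("INSERT INTO Authors(name, birthYear, birthLocation) VALUES (?, ?, ?)",
              ("Ioan Slavici", 1848, "Transilvania"))
    c.execute("INSERT INTO Fragments(id, title, year, author, genre) VALUES (?, ?, ?, ?, ?)",
              (1, "Moara cu noroc", 1880, "Ioan Slavici", "nuvela"))
    c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)",
              (1, "..."))  # placeholder for the actual fragment text
    con.commit()
    con.close()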

84
src/storage/results.py Normal file
@@ -0,0 +1,84 @@
import logging
import os
import sqlite3

from model.Word import *

log = logging.getLogger("storage")

DB_RESULTS = ""

COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies (
    idtext INTEGER,
    lettergroup TEXT,
    category TEXT,
    frequency REAL
)"""

COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords (
    idtext INTEGER,
    wordIndex INTEGER,
    sentenceIndex INTEGER,
    word TEXT,
    lemma TEXT,
    analysis TEXT,
    chunk TEXT
)"""

# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
#     idtext INTEGER,
#     wordlength INTEGER,
#     frequency REAL
# )"""


def initializeResultsDatabase(dbFile, cleanupOldData):
    global DB_RESULTS
    DB_RESULTS = dbFile

    # cleanup old data
    if cleanupOldData:
        con = sqlite3.connect(DB_RESULTS)
        c = con.cursor()

        try:
            c.execute("DROP TABLE LetterFrequencies")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_LETTER_FREQUENCIES)

        try:
            c.execute("DROP TABLE TextWords")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_TEXT_WORDS)

        con.commit()
        c.close()
        con.close()


def storeFrequencies(idtext, freq):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # add data
    categories = ['p', 'l1', 'l2', 'l3']
    for i in range(4):
        for let, fr in freq[i]:
            c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, categories[i], fr))

    con.commit()
    c.close()
    con.close()


def storeTtlAnalysis(idtext, words):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # store words
    for word in words:
        c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk))

    # finish
    con.commit()
    c.close()
    con.close()

14
src/test.py Normal file
@@ -0,0 +1,14 @@
# coding: utf-8
from ttl import ttlservice
from ttl import ttlparser
import nltk
import storage
data = storage.parseIndex("data")
print(data)
#textXml = ttlservice.executeTtl(u"Numele meu este Tibi și îmi place să cânt la chitară bass. Ce faci?")
#words, chunks = ttlparser.parseText(textXml)
#print ("Words: ", words)
#print ("Chunks: ", chunks)


@@ -0,0 +1,38 @@
import operator

import storage


def letterFrequencies(text):
    letterfreq = [{}, {}, {}, {}]
    lettersum = [0, 0, 0, 0]
    n = len(text)

    for i in range(n):

        # compute substring frequency
        # l = substring length
        for l in range(1, 4):
            sub = text[i : i + l].lower()
            if len(sub) == l and sub.isalnum():
                lettersum[l] += 1
                if sub not in letterfreq[l]:
                    letterfreq[l][sub] = 1
                else:
                    letterfreq[l][sub] += 1

        # compute punctuation frequency
        ch = text[i]
        if not ch.isalnum() and not ch.isspace() and ch.isprintable():
            lettersum[0] += 1
            if ch not in letterfreq[0]:
                letterfreq[0][ch] = 1
            else:
                letterfreq[0][ch] += 1

    # Almost done. Sort and remove irrelevant items (with low frequency), and normalize data
    for i in range(4):
        freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True)
        freqFiltered = freqSorted[0:50]
        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqFiltered]
        letterfreq[i] = freqNormalized

    return letterfreq
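For reference, a hedged usage sketch (not part of this commit; the input sentence is borrowed from NLP/data/text1): letterFrequencies returns four lists of (symbol, relative frequency) pairs - punctuation, then 1-, 2- and 3-character groups - in the same order that the 'p', 'l1', 'l2', 'l3' categories in src/storage/results.py assume.

# Hypothetical sketch; input text borrowed from NLP/data/text1.
from textprocessor.letterfreq import letterFrequencies

freq = letterFrequencies("Maria are mere. Ea mai are șapte pere.")
punct, letters1, letters2, letters3 = freq
for symbol, relfreq in letters1[:5]:
    print(symbol, relfreq)  # most frequent single letters and their normalized share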

@@ -0,0 +1,2 @@
def analyzeWords(text):
    pass

@@ -0,0 +1,128 @@
import re
import sqlite3
import urllib.error
import urllib.request

from pyquery import PyQuery

BASE_URL = "https://ro.wikisource.org"
LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"


def getAuthorList():
    authors = []

    for letter in LETTERS:
        print("Processing link page for letter", letter)

        # Read index page
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        data = urllib.request.urlopen(url).read()
        q = PyQuery(data)

        for item in q("div.mw-category-generated").find("a"):
            if item.text.startswith("Autor:"):
                authorname = item.text[6:]
                authorlink = item.attrib['href']
                authors.append((authorname, authorlink))

    return list(set(authors))


def getAuthorWikiLink(query):
    wikilink = None

    body = query("div#mw-content-text")
    table = body.find("table")
    for link in table.find("a"):
        if "ro.wikipedia.org" in link.attrib['href']:
            wikilink = link.attrib['href']

    return wikilink


def getAuthorLinksList(authorname, query):
    links = []

    body = query("div#mw-content-text")
    for link in body.find("a"):
        address = link.attrib['href']

        ok = True
        if "http" in address:
            ok = False
        if "redlink" in address:
            ok = False
        if "Fi%C8%99ier:" in address:
            ok = False
        if "index.php" in address:
            ok = False
        if address.startswith("#"):
            ok = False
        if "Autor:" in address:
            ok = False

        if ok:
            links.append(link.attrib['href'])

    return links


def getAuthorBasicInfo(authorname, authorlink):
    info = {}

    data = urllib.request.urlopen(BASE_URL + authorlink).read()
    q = PyQuery(data)

    info["wiki"] = getAuthorWikiLink(q)
    info["links"] = getAuthorLinksList(authorname, q)

    return info


# def getAuthorWikiInfo(authorinfo):
#     # Nothing can be learned without wiki page
#     if authorinfo["wiki"] is None:
#         return authorinfo
#     try:
#         data = urllib.request.urlopen(authorinfo["wiki"]).read()
#         q = PyQuery(data)
#         # Find the birth date
#         body = q("#mw-content-text").text()
#         result = re.compile(u"Născut\s+([\w\s]+)").match(body)
#         if not result is None:
#             authorinfo["birthyear"] = result.group(0)
#     except urllib.error.HTTPError:
#         pass
#     return authorinfo


def getText(url):
    data = urllib.request.urlopen(BASE_URL + url).read()
    q = PyQuery(data)

    texttitle = q("h1").text()

    body = q("#mw-content-text")
    body.find("table").remove()
    textcontent = body.text()

    return (texttitle, textcontent)


def addAuthorToDb(authorinfo):
    # NOTE: incomplete - the INSERT below has no column list or values yet
    con = sqlite3.connect("data/texts.db")
    c = con.cursor()
    c.execute("INSERT INTO Authors")


def getAllTexts():
    con = sqlite3.connect("data/texts.db")
    c = con.cursor()
    #c.execute("ALTER TABLE Authors ADD COLUMN wiki TEXT")

    id = 1

    authors = getAuthorList()
    for authorname, authorlink in authors:
        print("Processing author", authorname)

        authorinfo = getAuthorBasicInfo(authorname, authorlink)
        c.execute("INSERT INTO Authors(name,wiki) VALUES(?, ?)", (authorname, authorinfo["wiki"]))
        # authorinfo = getAuthorWikiInfo(authorinfo)

        for text in authorinfo["links"]:
            try:
                title, content = getText(text)
                c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)", (id, title, authorname))
                c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)", (id, content))
                id = id + 1
            except urllib.error.HTTPError:
                continue

        con.commit()


getAllTexts()

0
src/ttl/__init__.py Normal file

62
src/ttl/ttlparser.py Normal file
@@ -0,0 +1,62 @@
'''
Created on May 22, 2016

@author: tibi
'''

from xml.dom import minidom
from xml.parsers.expat import ExpatError

from model.Word import Word


def parseText(xmlText):
    words = []
    chunks = {}
    sentence_i = 0

    # get the root "segs" element
    try:
        dom = minidom.parseString(xmlText)
    except ExpatError as e:
        print("Error in text:", xmlText)
        print(e)
        exit(-1)

    alltext = dom.getElementsByTagName("segs")

    # iterate paragraphs
    for paragraph in alltext[0].getElementsByTagName("seg"):

        # iterate sentences
        for sentence in paragraph.getElementsByTagName("s"):

            # increment sentence index
            sentence_i += 1
            word_i = 0

            # iterate words
            for word in sentence.getElementsByTagName("w"):

                # increment word index
                word_i += 1

                # obtain word info
                wordText = word.firstChild.data
                lemma = word.getAttribute("lemma")
                ana = word.getAttribute("ana")
                chunk = word.getAttribute("chunk")

                # create word
                #w = Word(wordText, lemma, ana, chunk, sentence_i, word_i)
                #words.append(w)
                for c in chunk.split(","):
                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
                    words.append(w)

                    if chunks.get((sentence_i, c)) is None:
                        chunks[(sentence_i, c)] = [w]
                    else:
                        chunks[(sentence_i, c)].append(w)

    return (words, chunks)

34
src/ttl/ttlservice.py Normal file
@@ -0,0 +1,34 @@
# coding: utf-8
import zeep


def executeTtl(text):
    # Preprocess the text
    text = text.replace(u'ĭ', 'i')
    text = text.replace(u'ŭ', 'u')
    text = text.replace(u'à', 'a')

    client = zeep.Client("http://ws.racai.ro/ttlws.wsdl")
    textSgml = client.service.UTF8toSGML(text)
    result = client.service.XCES("ro", "id", textSgml)

    # Cleanup result - generate valid xml
    result = result.replace('&rsquor;', '`')
    result = result.replace('&abreve;', u'ă')
    result = result.replace('&agrave;', u'à')
    result = result.replace('&acirc;', u'â')
    result = result.replace('&icirc;', u'î')
    result = result.replace('&scedil;', u'ș')
    result = result.replace('&tcedil;', u'ț')
    result = result.replace('&ubreve;', u'u')
    result = result.replace('&Abreve;', u'Ă')
    result = result.replace('&Agrave;', u'À')
    result = result.replace('&Acirc;', u'Â')
    result = result.replace('&Icirc;', u'Î')
    result = result.replace('&Scedil;', u'Ș')
    result = result.replace('&Tcedil;', u'Ț')
    result = result.replace('&Ubreve;', u'U')

    xmlResult = "<?xml version=\"1.0\" encoding=\"utf-8\" ?><segs>"
    xmlResult += result
    xmlResult += "</segs>"

    return xmlResult
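Finally, a hedged end-to-end sketch (not part of this commit) mirroring the calls that are commented out in src/test.py; note that, as the comment in src/main.py warns, the remote TTL web service can take several minutes per text.

# Hypothetical sketch; the sample sentence is borrowed from NLP/data/text1.
from ttl import ttlservice, ttlparser

xmlResult = ttlservice.executeTtl(u"Maria are mere. Ea mai are șapte pere.")
words, chunks = ttlparser.parseText(xmlResult)
print("Words:", words)
print("Chunks:", chunks)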