Initial commit

This commit is contained in:
2017-05-23 13:57:53 +03:00
commit 6badfbd103
38 changed files with 1286 additions and 0 deletions
+107
View File
@@ -0,0 +1,107 @@
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
logs/*
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
+220
View File
@@ -0,0 +1,220 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Run without debugging",
"type": "python",
"request": "launch",
"stopOnEntry": false,
"pythonPath": "${config:python.pythonPath}",
"program": "${file}",
"cwd": "${workspaceRoot}",
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "Python",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"program": "${file}",
"cwd": "${workspaceRoot}",
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "PySpark",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"osx": {
"pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
},
"windows": {
"pythonPath": "${env:SPARK_HOME}/bin/spark-submit.cmd"
},
"linux": {
"pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
},
"program": "${file}",
"cwd": "${workspaceRoot}",
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "Python Module",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"module": "module.name",
"cwd": "${workspaceRoot}",
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "Integrated Terminal/Console",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"program": "${file}",
"cwd": "",
"console": "integratedTerminal",
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit"
]
},
{
"name": "External Terminal/Console",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"program": "${file}",
"cwd": "",
"console": "externalTerminal",
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit"
]
},
{
"name": "Django",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"program": "${workspaceRoot}/manage.py",
"cwd": "${workspaceRoot}",
"args": [
"runserver",
"--noreload"
],
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput",
"DjangoDebugging"
]
},
{
"name": "Flask",
"type": "python",
"request": "launch",
"stopOnEntry": false,
"pythonPath": "${config:python.pythonPath}",
"program": "fully qualified path of 'flask' executable. Generally located along with python interpreter",
"cwd": "${workspaceRoot}",
"env": {
"FLASK_APP": "${workspaceRoot}/quickstart/app.py"
},
"args": [
"run",
"--no-debugger",
"--no-reload"
],
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "Flask (old)",
"type": "python",
"request": "launch",
"stopOnEntry": false,
"pythonPath": "${config:python.pythonPath}",
"program": "${workspaceRoot}/run.py",
"cwd": "${workspaceRoot}",
"args": [],
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "Pyramid",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"cwd": "${workspaceRoot}",
"env": {},
"envFile": "${workspaceRoot}/.env",
"args": [
"${workspaceRoot}/development.ini"
],
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput",
"Pyramid"
]
},
{
"name": "Watson",
"type": "python",
"request": "launch",
"stopOnEntry": true,
"pythonPath": "${config:python.pythonPath}",
"program": "${workspaceRoot}/console.py",
"cwd": "${workspaceRoot}",
"args": [
"dev",
"runserver",
"--noreload=True"
],
"env": {},
"envFile": "${workspaceRoot}/.env",
"debugOptions": [
"WaitOnAbnormalExit",
"WaitOnNormalExit",
"RedirectOutput"
]
},
{
"name": "Attach (Remote Debug)",
"type": "python",
"request": "attach",
"localRoot": "${workspaceRoot}",
"remoteRoot": "${workspaceRoot}",
"port": 3000,
"secret": "my_secret",
"host": "localhost"
}
]
}
+3
View File
@@ -0,0 +1,3 @@
// Place your settings in this file to overwrite default and user settings.
{
}
BIN
View File
Binary file not shown.
+17
View File
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>NLP</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.python.pydev.PyDevBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.python.pydev.pythonNature</nature>
</natures>
</projectDescription>
+8
View File
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?><pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/${PROJECT_DIR_NAME}/src</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 3.0</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python 3.5</pydev_property>
</pydev_project>
+1
View File
@@ -0,0 +1 @@
Maria are mere. Ea mai are șapte pere. Acestea sunt foarte delicioase.
+6
View File
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Maria" ana="Np" chunk="Np#1">Maria</w><w lemma="avea" ana="Vmip3s" chunk="Vp#1">are</w><w lemma="măr" ana="Ncfp-n" chunk="Np#2">mere</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="el" ana="Pp3fsr--------s">Ea</w><w lemma="mai" ana="Rp" chunk="Ap#1">mai</w><w lemma="avea" ana="Vmip3s" chunk="Vp#1">are</w><w lemma="șapte" ana="Mc-p-l" chunk="Np#1">șapte</w><w lemma="pară" ana="Ncfp-n" chunk="Np#1">pere</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.3"><w lemma="acesta" ana="Pd3fpr">Acestea</w><w lemma="fi" ana="Vmip3p" chunk="Vp#1">sunt</w><w lemma="foarte" ana="Rp" chunk="Ap#1,Vp#1">foarte</w><w lemma="delicios" ana="Afpfp-n" chunk="Ap#1">delicioase</w><c>.</c></s></seg>
</segs>
+1
View File
@@ -0,0 +1 @@
Sabeer Bhatia a ajuns la Aeroportul Internațional din Los Angeles la ora 18 în data de 23 septembrie 1998. Zborul său din Bangalore a durat 22 ore, și el era înfometat.
+5
View File
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Sabeer" ana="Np" chunk="Np#1">Sabeer</w><w lemma="Bhatia" ana="Np" chunk="Np#1">Bhatia</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="ajunge" ana="Vmp--sm" chunk="Vp#1">ajuns</w><w lemma="la" ana="Spsa" chunk="Pp#1">la</w><w lemma="aeroport" ana="Ncmsry" chunk="Pp#1,Np#2">Aeroportul</w><w lemma="internațional" ana="Afpms-n" chunk="Pp#1,Np#2,Ap#1">Internațional</w><w lemma="din" ana="Spsa" chunk="Pp#2">din</w><w lemma="Los" ana="Np" chunk="Pp#2,Np#3">Los</w><w lemma="Angeles" ana="Np" chunk="Pp#2,Np#3">Angeles</w><w lemma="la" ana="Spsa" chunk="Pp#3">la</w><w lemma="oră" ana="Ncfsry" chunk="Pp#3,Np#4">ora</w><w lemma="18" ana="Mc" chunk="Pp#3,Np#4">18</w><w lemma="în" ana="Spsa" chunk="Pp#4">în</w><w lemma="dată" ana="Ncfsry" chunk="Pp#4,Np#5">data</w><w lemma="de" ana="Spsa" chunk="Pp#5">de</w><w lemma="23" ana="Mc" chunk="Pp#5,Np#6">23</w><w lemma="septembrie" ana="Ncms-n" chunk="Pp#5,Np#6">septembrie</w><w lemma="1998" ana="Mc" chunk="Pp#5,Np#6">1998</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="zbor" ana="Ncmsry" chunk="Np#1">Zborul</w><w lemma="său" ana="Ds3ms-s" chunk="Np#1">său</w><w lemma="din" ana="Spsa" chunk="Pp#1">din</w><w lemma="Bangalore" ana="Np" chunk="Pp#1,Np#2">Bangalore</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="dura" ana="Vmp--sm" chunk="Vp#1">durat</w><w lemma="22" ana="Mc" chunk="Np#3">22</w><w lemma="oră" ana="Ncfp-n" chunk="Np#3">ore</w><c>,</c><w lemma="și" ana="Crssp">și</w><w lemma="el" ana="Pp3msr--------s" chunk="Vp#2">el</w><w lemma="fi" ana="Vaii3s" chunk="Vp#2">era</w><w lemma="înfometa" ana="Vmp--sm" chunk="Vp#2">înfometat</w><c>.</c></s></seg>
</segs>
+1
View File
@@ -0,0 +1 @@
Sophia Loren spune că ea va fi întotdeauna mulțumitoare față de Bono. Actrița a dezvăluit că cântărețul trupei U2 a ajutat-o să se liniștească atunci când ea s-a speriat de o furtună în timp ce zburau cu avionul.
+5
View File
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8" ?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Sophia" ana="Np" chunk="Np#1">Sophia</w><w lemma="Loren" ana="Np" chunk="Np#1">Loren</w><w lemma="spune" ana="Vmnp" chunk="Vp#1">spune</w><w lemma="că" ana="Csssp"></w><w lemma="el" ana="Pp3fsr--------s" chunk="Vp#2">ea</w><w lemma="vrea" ana="Va--3s" chunk="Vp#2">va</w><w lemma="fi" ana="Vmnp" chunk="Vp#2">fi</w><w lemma="întotdeauna" ana="Rgp" chunk="Vp#2,Ap#1">întotdeauna</w><w lemma="mulțumitor" ana="Afpf--n" chunk="Ap#1">mulțumitoare</w><w lemma="față_de" ana="Spca" chunk="Pp#1">față_de</w><w lemma="bonă" ana="Ncfsvy" chunk="Pp#1,Np#2">Bono</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="actriță" ana="Ncfsry" chunk="Np#1">Actrița</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="dezvălui" ana="Vmp--sm" chunk="Vp#1">dezvăluit</w><w lemma="că" ana="Csssp"></w><w lemma="cântăreț" ana="Ncmsry" chunk="Np#2">cântărețul</w><w lemma="trupă" ana="Ncfsoy" chunk="Np#2">trupei</w><w lemma="U2" ana="Np" chunk="Np#2">U2</w><w lemma="avea" ana="Va--3s" chunk="Vp#2">a</w><w lemma="ajuta" ana="Vmp--sm" chunk="Vp#2">ajutat</w><w lemma="el" ana="Pp3fsa--y-----w">-o</w><w lemma="să" ana="Qs" chunk="Vp#3"></w><w lemma="sine" ana="Px3--a--------w" chunk="Vp#3">se</w><w lemma="liniști" ana="Vmsp3" chunk="Vp#3">liniștească</w><w lemma="atunci_când" ana="Rw" chunk="Vp#3,Ap#1">atunci_când</w><w lemma="el" ana="Pp3fsr--------s">ea</w><w lemma="sine" ana="Px3--a--y-----w" chunk="Vp#4">s-</w><w lemma="avea" ana="Va--3s" chunk="Vp#4">a</w><w lemma="speria" ana="Vmp--sm" chunk="Vp#4">speriat</w><w lemma="de" ana="Spsa" chunk="Pp#1">de</w><w lemma="un" ana="Tifsr" chunk="Pp#1,Np#3">o</w><w lemma="furtună" ana="Ncfsrn" chunk="Pp#1,Np#3">furtună</w><w lemma="în_timp_ce" ana="Cscsp">în_timp_ce</w><w lemma="zbura" ana="Vmii3p" chunk="Vp#5">zburau</w><w lemma="cu" ana="Spsa" chunk="Pp#2">cu</w><w lemma="avion" ana="Ncmsry" chunk="Pp#2,Np#4">avionul</w><c>.</c></s></seg>
</segs>
+121
View File
@@ -0,0 +1,121 @@
'''
Created on May 22, 2016
@author: tibi
'''
from model import Word
def getGender(word):
    """Return the gender code to use when comparing `word` with another word.

    First- and second-person pronouns yield 'n'; any other word yields
    whatever its own getGender() reports.
    """
    firstOrSecondPerson = word.isPronoun() and word.pronounGetPerson() in ('1', '2')
    return 'n' if firstOrSecondPerson else word.getGender()
def genderMatch(word1, word2):
    """Score how well the genders of two words agree.

    Returns 2 when both gender codes are equal, 1 when they differ but at
    least one of them is 'n', and 0 otherwise.
    """
    genders = (getGender(word1), getGender(word2))
    if genders[0] == genders[1]:
        return 2
    return 1 if 'n' in genders else 0
def isPrepositional(chunk):
    """Return True when at least one word of `chunk` is a preposition."""
    return any(member.isPreposition() for member in chunk)
def countInText(noun, text):
    """Count the words of `text` whose surface form equals that of `noun`."""
    wanted = noun.text
    return sum(1 for candidate in text if candidate.text == wanted)
def anaphora(text, chunks):
    """Resolve each pronoun in `text` to one of the nouns that precede it.

    Every word is printed as it is visited (nouns are prefixed with "[n]").
    For each pronoun, the most recent preceding nouns that agree with it in
    gender and number become candidates; each candidate gets a score built
    from additive indicators, and the first candidate with the highest score
    is picked and printed.

    Args:
        text: sequence of Word objects in reading order.
        chunks: dict mapping (sentenceIndex, chunk) to the list of words
            that belong to that chunk.

    Returns:
        A list of (pronoun, pickedNoun) pairs, one per pronoun for which at
        least one candidate was found, in text order.
    """
    nounPhrases = []
    resolved = []

    for word in text:
        if word.isNoun():
            print("[n]", word)
            nounPhrases.append((word, (word.sentenceIndex, word.chunk)))
        else:
            print(word)

        if not word.isPronoun():
            continue

        candidates = []
        # Walk backwards over the 29 most recently seen nouns
        for noun, chunkIndex in nounPhrases[:-30:-1]:
            # Gender and number must agree; the agreement score is the base value
            agreement = genderMatch(word, noun)
            if agreement <= 0 or word.getNumber() != noun.getNumber():
                continue
            npInd = agreement

            # Definiteness: indefinite nouns are penalised. nounIsDefinite()
            # may report True/False or the tag characters 'y'/'n'; a plain
            # truth test would treat 'n' as definite, so compare explicitly.
            if noun.nounIsDefinite() not in (True, 'y'):
                npInd -= 1

            # Nouns inside a prepositional chunk are penalised
            if isPrepositional(chunks[chunkIndex]):
                npInd -= 1

            # First in sentence: the noun opens its sentence
            if noun.wordIndex == 1:
                npInd += 1

            # Indicating verbs: not implemented

            # Lexical reiteration: the same surface form occurs repeatedly
            occurrences = countInText(noun, text)
            if occurrences == 2:
                npInd += 1
            elif occurrences > 2:
                npInd += 2

            # Referential distance, measured in sentences
            dist = word.sentenceIndex - noun.sentenceIndex
            if dist == 0:
                npInd += 2
            elif dist == 1:
                npInd += 1

            candidates.append((noun, npInd))
            print("...> Candidate: {0} npInd = {1}".format(noun, npInd))

        if candidates:
            # max() keeps the first candidate among equal scores, i.e. the
            # noun closest to the pronoun
            pickedWord = max(candidates, key=lambda candidate: candidate[1])[0]
            print(".>>> Picked: {0}".format(pickedWord))
            resolved.append((word, pickedWord))

    return resolved
+55
View File
@@ -0,0 +1,55 @@
'''
Created on May 22, 2016
@author: tibi
'''
from xml.dom import minidom;
from model.Word import Word
def parse(filename):
    """Read a TTL result file and return its words and chunks.

    The file is expected to hold a root "segs" element containing "seg"
    paragraphs, "s" sentences and "w" words. Sentences are numbered from 1
    across the whole file; words are numbered from 1 within each sentence.
    A word whose "chunk" attribute lists several comma-separated chunks
    produces one Word per chunk.

    Args:
        filename: path of the XML file to read.

    Returns:
        A tuple (words, chunks): `words` is the list of Word objects in
        document order, `chunks` maps (sentenceIndex, chunk) to the list of
        Word objects in that chunk.
    """
    words = []
    chunks = {}
    sentence_i = 0

    # get the root "segs" element
    dom = minidom.parse(filename)
    alltext = dom.getElementsByTagName("segs")

    for paragraph in alltext[0].getElementsByTagName("seg"):
        for sentence in paragraph.getElementsByTagName("s"):
            sentence_i += 1
            for word_i, word in enumerate(sentence.getElementsByTagName("w"), 1):
                # A <w> element can be empty (no text node); use an empty
                # string instead of failing on firstChild being None.
                textNode = word.firstChild
                wordText = textNode.data if textNode is not None else ""
                lemma = word.getAttribute("lemma")
                ana = word.getAttribute("ana")
                chunk = word.getAttribute("chunk")

                # one Word per chunk the token belongs to
                for c in chunk.split(","):
                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
                    words.append(w)
                    chunks.setdefault((sentence_i, c), []).append(w)

    return (words, chunks)
+26
View File
@@ -0,0 +1,26 @@
'''
Created on May 22, 2016
@author: tibi
'''
import fileparser
from anaphora import anaphora
if __name__ == '__main__':
    # Demo run: parse one processed sample, dump what was parsed, then run
    # the anaphora resolver on it. The path is relative to the working directory.
    words, chunks = fileparser.parse("../data/text3_processed.xml")
    # Every parsed word with its sentence and word index
    print("Words:")
    for word in words:
        print("[{0} {1}] {2}".format(word.sentenceIndex, word.wordIndex, word))
    print("")
    # Every chunk key followed by the words it contains
    print("Chunks:")
    for key, value in chunks.items():
        print(key, ":")
        for word in value:
            print(" - ", word)
    print("")
    # The resolver prints its own trace
    print("Anaphora resolution:")
    anaphora(words, chunks)
+88
View File
@@ -0,0 +1,88 @@
'''
Created on May 22, 2016
@author: tibi
'''
class Word:
    """A token from the TTL analysis, with accessors that decode its `ana` tag.

    The accessors read fixed character positions of `ana`. Position 0 is the
    part of speech ("N" noun, "P" pronoun, "V" verb, "S" adposition); what
    the later positions mean depends on it and is described per method.
    Accessors never raise on a tag that is shorter than expected.
    """

    # Class-level defaults, kept so the attributes also exist on the class itself
    text = ""
    lemma = ""
    ana = ""
    chunk = ""
    sentenceIndex = 0
    wordIndex = 0

    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
        self.text = text
        self.lemma = lemma
        self.ana = ana
        self.chunk = chunk
        self.sentenceIndex = sentenceIndex
        self.wordIndex = wordIndex

    def __str__(self):
        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)

    def __repr__(self):
        return str(self)

    def _anaAt(self, index):
        """Return the tag character at `index`, or None when the tag is too short."""
        return self.ana[index] if len(self.ana) > index else None

    def isNoun(self):
        """Return True when the tag starts with "N"."""
        return self.ana.startswith("N")

    def nounIsCommon(self):
        """Return True for a noun whose tag has "c" at position 1."""
        return self.isNoun() and self._anaAt(1) == "c"

    def nounIsProper(self):
        """Return True for a noun whose tag has "p" at position 1."""
        return self.isNoun() and self._anaAt(1) == "p"

    def nounGetCase(self):
        """Return position 4 of a noun tag, or None for non-nouns and short tags."""
        if self.isNoun():
            return self._anaAt(4)
        return None

    def nounIsDefinite(self):
        """Tell whether this noun is definite (articulated).

        Proper nouns always count as definite; a common noun is definite when
        position 5 of its tag is "y". The result is always a bool so it can
        be used directly in a condition; non-nouns give False.
        """
        if not self.isNoun():
            return False
        if self.nounIsProper():
            return True
        return self._anaAt(5) == "y"

    def pronounGetPerson(self):
        """Return position 2 of a pronoun tag, or None for non-pronouns."""
        if self.isPronoun():
            return self._anaAt(2)
        return None

    def getGender(self):
        """Return the gender character of a noun or pronoun.

        Nouns use position 2 and pronouns position 3 of the tag; 'n' is
        returned when the tag is too short to carry one. Other parts of
        speech give None.
        """
        if self.isNoun():
            position = 2
        elif self.isPronoun():
            position = 3
        else:
            return None
        gender = self._anaAt(position)
        return 'n' if gender is None else gender

    def getNumber(self):
        """Return the number character of a noun or pronoun.

        Proper nouns are always 's'. Common nouns use position 3 and
        pronouns position 4 of the tag. Other parts of speech, and tags too
        short to carry the position, give None.
        """
        if self.isNoun():
            if self.nounIsProper():
                return 's'
            return self._anaAt(3)
        if self.isPronoun():
            return self._anaAt(4)
        return None

    def isPronoun(self):
        """Return True when the tag starts with "P"."""
        return self.ana.startswith("P")

    def isVerb(self):
        """Return True when the tag starts with "V"."""
        return self.ana.startswith("V")

    def isPreposition(self):
        """Return True when the tag starts with "Sp"."""
        return self.ana.startswith("Sp")
View File
+1
View File
@@ -0,0 +1 @@
books/Moara cu noroc - Ioan Slavici.epub;Moara cu noroc;1880;Ioan Slavici;1848;Transilvania
1 books/Moara cu noroc - Ioan Slavici.epub Moara cu noroc 1880 Ioan Slavici 1848 Transilvania
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
+20
View File
@@ -0,0 +1,20 @@
import time
import logging
def init_logger(level):
    """Configure root logging to a timestamped file plus the console.

    The file receives every record from DEBUG upwards; the console handler
    only shows records at `level` or above.

    Args:
        level: logging level applied to the console handler.
    """
    import os

    # Log filename
    tm = time.strftime('%Y-%m-%d_%H-%M-%S')
    logFile = "logs/log_{0}.log".format(tm)

    # The log directory is not created by the logging module, so make sure
    # it exists before the file handler is set up.
    os.makedirs(os.path.dirname(logFile), exist_ok=True)

    # Set up file logger
    logging.basicConfig(filename=logFile,
                        level=logging.DEBUG,
                        format='%(asctime)s %(name)s %(levelname)s %(message)s',
                        datefmt='%m-%d %H:%M')

    # Set up console logger
    formatter = logging.Formatter('[%(name)s] %(levelname)s: %(message)s')
    console = logging.StreamHandler()
    console.setLevel(level)
    console.setFormatter(formatter)
    logging.getLogger().addHandler(console)
+40
View File
@@ -0,0 +1,40 @@
import logging
import time
# own
import logger
import storage.data
import storage.results
import textprocessor.letterfreq
import ttl.ttlparser
import ttl.ttlservice
def init():
    """Set up logging and open both SQLite databases used by the run."""
    textsDb = "data/texts.db"
    resultsDb = "data/results.db"

    logger.init_logger(logging.WARNING)
    storage.data.initializeFragmentDatabase(textsDb)
    # The second argument is the cleanupOldData flag of the results module
    storage.results.initializeResultsDatabase(resultsDb, True)
def processTexts():
    """Compute and store letter frequencies for every stored text.

    Progress is printed before each item, counting from 0, followed by a
    final "Finished!" line.
    """
    total = storage.data.getTextCount()

    for position, item in enumerate(storage.data.getAllTexts()):
        print("Processing item", position, "out of", total)
        itemid, itemtext = item[0], item[1]

        # The TTL analysis (ttl.ttlservice.executeTtl, ttl.ttlparser.parseText,
        # storage.results.storeTtlAnalysis) is left out on purpose: it is
        # unfeasible here, taking 5-10 minutes for a single text.

        # perform analysis
        frequencies = textprocessor.letterfreq.letterFrequencies(itemtext)
        storage.results.storeFrequencies(itemid, frequencies)

    print("Finished!")
# Entry point: there is no __main__ guard, so both calls run whenever this
# module is executed or imported.
init()
processTexts()
+32
View File
@@ -0,0 +1,32 @@
# Defines a fragment author
class Author:
    """Author of a text fragment: name, year of birth and location."""

    def __init__(self, name = "", birthYear = "", location = "Romania"):
        self.name, self.yearOfBirth, self.location = name, birthYear, location

    def __str__(self):
        return self.name

    # repr and str give the same text: the author's name
    __repr__ = __str__

    def dump(self):
        """Return a bracketed one-line description listing every field."""
        fields = (self.name, self.yearOfBirth, self.location)
        return "[Author name={0} yearOfBirth={1} location={2}]".format(*fields)
# Defines a text fragment
class Fragment:
    """A text fragment: title, body text, author and year.

    When no author is given, each fragment gets its own default Author
    instead of sharing a single instance created at definition time.
    """

    def __init__(self, title = "", text = "", author = None, year = 1999):
        self.title = title
        self.text = text
        # Build the default lazily so fragments never share one Author object
        self.author = Author() if author is None else author
        self.year = year

    def __str__(self):
        return self.title

    def __repr__(self):
        return self.title

    def dump(self):
        """Return a bracketed one-line description, including the author's dump."""
        return "[Fragment title={0} author={1} year={2} text={3}]".format(self.title, self.author.dump(), self.year, self.text)
+87
View File
@@ -0,0 +1,87 @@
# Defines a processed word
class Word:
    """A processed word, with accessors that decode its `ana` tag.

    The accessors read fixed character positions of `ana`. Position 0 is the
    part of speech ("N" noun, "P" pronoun, "V" verb, "S" adposition); what
    the later positions mean depends on it and is described per method.
    Accessors never raise on a tag that is shorter than expected.
    """

    # Class-level defaults, kept so the attributes also exist on the class itself
    text = ""
    lemma = ""
    ana = ""
    chunk = ""
    sentenceIndex = 0
    wordIndex = 0

    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
        self.text = text
        self.lemma = lemma
        self.ana = ana
        self.chunk = chunk
        self.sentenceIndex = sentenceIndex
        self.wordIndex = wordIndex

    def __str__(self):
        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)

    def __repr__(self):
        return str(self)

    def _anaAt(self, index):
        """Return the tag character at `index`, or None when the tag is too short."""
        return self.ana[index] if len(self.ana) > index else None

    def isNoun(self):
        """Return True when the tag starts with "N"."""
        return self.ana.startswith("N")

    def nounIsCommon(self):
        """Return True for a noun whose tag has "c" at position 1."""
        return self.isNoun() and self._anaAt(1) == "c"

    def nounIsProper(self):
        """Return True for a noun whose tag has "p" at position 1."""
        return self.isNoun() and self._anaAt(1) == "p"

    def nounGetCase(self):
        """Return position 4 of a noun tag, or None for non-nouns and short tags."""
        if self.isNoun():
            return self._anaAt(4)
        return None

    def nounIsDefinite(self):
        """Tell whether this noun is definite (articulated).

        Proper nouns always count as definite; a common noun is definite when
        position 5 of its tag is "y". The result is always a bool so it can
        be used directly in a condition; non-nouns give False.
        """
        if not self.isNoun():
            return False
        if self.nounIsProper():
            return True
        return self._anaAt(5) == "y"

    def pronounGetPerson(self):
        """Return position 2 of a pronoun tag, or None for non-pronouns."""
        if self.isPronoun():
            return self._anaAt(2)
        return None

    def getGender(self):
        """Return the gender character of a noun or pronoun.

        Nouns use position 2 and pronouns position 3 of the tag; 'n' is
        returned when the tag is too short to carry one. Other parts of
        speech give None.
        """
        if self.isNoun():
            position = 2
        elif self.isPronoun():
            position = 3
        else:
            return None
        gender = self._anaAt(position)
        return 'n' if gender is None else gender

    def getNumber(self):
        """Return the number character of a noun or pronoun.

        Proper nouns are always 's'. Common nouns use position 3 and
        pronouns position 4 of the tag. Other parts of speech, and tags too
        short to carry the position, give None.
        """
        if self.isNoun():
            if self.nounIsProper():
                return 's'
            return self._anaAt(3)
        if self.isPronoun():
            return self._anaAt(4)
        return None

    def isPronoun(self):
        """Return True when the tag starts with "P"."""
        return self.ana.startswith("P")

    def isVerb(self):
        """Return True when the tag starts with "V"."""
        return self.ana.startswith("V")

    def isPreposition(self):
        """Return True when the tag starts with "Sp"."""
        return self.ana.startswith("Sp")
View File
View File
+80
View File
@@ -0,0 +1,80 @@
import logging
import os
from model import *
import sqlite3
log = logging.getLogger("storage")
DB_FRAGMENTS = ""
# Commands
# birth location - general area, not exact location (i.e. Transylvania)
# birth origin - rural or urban
# studies - masters, bachelors, high school, middle school, primary school
# occupation - comma separated if there are multiple
# studiesAbroad - foreign cities where author studied (comma separated)
COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
name TEXT PRIMARY KEY,
birthYear INTEGER,
birthLocation TEXT,
birthOrigin TEXT,
studies TEXT,
occupations TEXT,
studiesAbroad TEXT
)"""
# genre - short story (nuvela), novel (roman), poem etc
# movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
# tags - other relevant information (i.e. psychological)
COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
id INTEGER PRIMARY KEY,
title TEXT,
year INTEGER,
author TEXT REFERENCES Authors(name),
genre TEXT,
movement TEXT,
tags TEXT
)"""
# contains the actual text
COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
id INTEGER REFERENCES Fragments(id),
content TEXT
)"""
# Initialize databases
def initializeFragmentDatabase(dbFile):
    """Remember `dbFile` as the texts database and create it when missing.

    The path is stored in the module-level DB_FRAGMENTS. If no file exists
    at that path, the Authors, Fragments and FragmentsContent tables are
    created; an existing file is left untouched.

    Args:
        dbFile: path of the SQLite file holding the texts.
    """
    global DB_FRAGMENTS
    DB_FRAGMENTS = dbFile

    if os.path.exists(dbFile):
        return

    log.info("Text database %s not found. Will create database.", dbFile)
    con = sqlite3.connect(dbFile)
    try:
        c = con.cursor()
        c.execute(COMMAND_CREATE_AUTHORS)
        c.execute(COMMAND_CREATE_FRAGMENTS)
        c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT)
        con.commit()
    finally:
        # Release the connection even when one of the statements fails
        con.close()
    log.info("Database created!")
def getTextCount():
    """Return the number of rows in the Fragments table of the texts database."""
    con = sqlite3.connect(DB_FRAGMENTS)
    try:
        c = con.cursor()
        c.execute("SELECT COUNT(*) FROM Fragments")
        item = c.fetchone()
        c.close()
    finally:
        # Release the connection even when the query fails
        con.close()
    return item[0]
def getAllTexts():
    """Return every (id, content) row of the FragmentsContent table as a list."""
    con = sqlite3.connect(DB_FRAGMENTS)
    try:
        c = con.cursor()
        c.execute("SELECT id, content FROM FragmentsContent")
        items = c.fetchall()
        c.close()
    finally:
        # Release the connection even when the query fails
        con.close()
    return items
+84
View File
@@ -0,0 +1,84 @@
import logging
import os
from model.Word import *
import sqlite3
log = logging.getLogger("storage")
DB_RESULTS = ""
COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies (
idtext INTEGER,
lettergroup TEXT,
category TEXT,
frequency REAL
)"""
COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords (
idtext INTEGER,
wordIndex INTEGER,
sentenceIndex INTEGER,
word TEXT,
lemma TEXT,
analysis TEXT,
chunk TEXT
)"""
# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
# idtext INTEGER,
# wordlength INTEGER,
# frequency REAL
# )"""
def initializeResultsDatabase(dbFile, cleanupOldData):
    """Remember `dbFile` as the results database and make sure its tables exist.

    The path is stored in the module-level DB_RESULTS. The LetterFrequencies
    and TextWords tables are created when they are missing, so the store
    functions also work on a fresh database when no cleanup is requested.

    Args:
        dbFile: path of the SQLite file holding the results.
        cleanupOldData: when true, both tables are dropped first, discarding
            any previously stored results.
    """
    global DB_RESULTS
    DB_RESULTS = dbFile

    tables = (
        ("LetterFrequencies", COMMAND_CREATE_LETTER_FREQUENCIES),
        ("TextWords", COMMAND_CREATE_TEXT_WORDS),
    )

    con = sqlite3.connect(DB_RESULTS)
    try:
        c = con.cursor()
        for tableName, createCommand in tables:
            if cleanupOldData:
                c.execute("DROP TABLE IF EXISTS " + tableName)
            # Only create what is not there, so existing data is kept
            c.execute("SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?", (tableName,))
            if c.fetchone() is None:
                c.execute(createCommand)
        con.commit()
        c.close()
    finally:
        # Release the connection even when a statement fails
        con.close()
def storeFrequencies(idtext, freq):
    """Store the frequency groups of one text in the LetterFrequencies table.

    Args:
        idtext: id of the text the frequencies belong to.
        freq: indexable with at least four entries; entry i is an iterable
            of (lettergroup, frequency) pairs stored under the category
            'p', 'l1', 'l2' or 'l3' respectively.
    """
    categories = ['p', 'l1', 'l2', 'l3']
    rows = []
    for i, category in enumerate(categories):
        for let, fr in freq[i]:
            rows.append((idtext, let, category, fr))

    con = sqlite3.connect(DB_RESULTS)
    try:
        c = con.cursor()
        c.executemany("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", rows)
        con.commit()
        c.close()
    finally:
        # Release the connection even when the insert fails
        con.close()
def storeTtlAnalysis(idtext, words):
    """Store the analysed words of one text in the TextWords table.

    Args:
        idtext: id of the text the words belong to.
        words: iterable of objects exposing wordIndex, sentenceIndex, text,
            lemma, ana and chunk.
    """
    rows = [
        (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk)
        for word in words
    ]

    con = sqlite3.connect(DB_RESULTS)
    try:
        c = con.cursor()
        c.executemany("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", rows)
        con.commit()
        c.close()
    finally:
        # Release the connection even when the insert fails
        con.close()
+14
View File
@@ -0,0 +1,14 @@
# coding: utf-8
from ttl import ttlservice
from ttl import ttlparser
import nltk
import storage
# Scratch script: load the index from the "data" location and print it.
# NOTE(review): parseIndex is not defined in the storage code visible here — confirm it exists.
data = storage.parseIndex("data")
print(data)
#textXml = ttlservice.executeTtl(u"Numele meu este Tibi și îmi place să cânt la chitară bass. Ce faci?")
#words, chunks = ttlparser.parseText(textXml)
#print ("Words: ", words)
#print ("Chunks: ", chunks)
View File
+38
View File
@@ -0,0 +1,38 @@
import operator
import storage
def letterFrequencies(text, top=50):
    """Compute relative frequencies of punctuation and of 1-3 letter groups.

    Args:
        text: the string to analyse.
        top: how many of the most frequent entries to keep per group
            (None keeps all of them).

    Returns:
        A list of four lists of (symbol, frequency) pairs, most frequent
        first. Index 0 holds punctuation (printable characters that are
        neither alphanumeric nor whitespace); index 1, 2 and 3 hold the
        lower-cased alphanumeric substrings of that length. Each frequency
        is the entry's count divided by the total count of its group,
        computed before the `top` cut-off is applied.
    """
    from collections import Counter

    groups = [Counter() for _ in range(4)]

    for i, symbol in enumerate(text):
        # substrings of length 1..3 starting at position i
        for length in range(1, 4):
            sub = text[i:i + length].lower()
            # the length check drops substrings cut short by the end of the text
            if len(sub) == length and sub.isalnum():
                groups[length][sub] += 1

        # punctuation
        if not symbol.isalnum() and not symbol.isspace() and symbol.isprintable():
            groups[0][symbol] += 1

    result = []
    for counts in groups:
        total = sum(counts.values())
        ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)[:top]
        # `ranked` is empty whenever `total` is 0, so the division is safe
        result.append([(entry, count / total) for entry, count in ranked])
    return result
+2
View File
@@ -0,0 +1,2 @@
def analyzeWords(text):
    """Placeholder for word-level analysis; currently does nothing."""
    return None
+128
View File
@@ -0,0 +1,128 @@
import re
import sqlite3
import urllib
import urllib.error
import urllib.request

from pyquery import PyQuery
BASE_URL = "https://ro.wikisource.org"
LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
def getAuthorList():
    """Collect the authors listed on the per-letter category pages.

    One category page is fetched for every letter in LETTERS. Links whose
    text starts with "Autor:" are kept, with that prefix removed from the
    name.

    Returns:
        A list of unique (authorname, authorlink) tuples; the order is not
        specified because duplicates are removed through a set.
    """
    import urllib.request

    prefix = "Autor:"
    authors = []
    for letter in LETTERS:
        print("Processing link page for letter", letter)

        # Read index page
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        with urllib.request.urlopen(url) as response:
            data = response.read()
        q = PyQuery(data)

        for item in q("div.mw-category-generated").find("a"):
            # An anchor without direct text has text None; treat it as empty
            label = item.text or ""
            if label.startswith(prefix):
                authors.append((label[len(prefix):], item.attrib['href']))
    return list(set(authors))
def getAuthorWikiLink(query):
    """Find the Romanian Wikipedia link on an author page.

    Only anchors inside tables of the "div#mw-content-text" element are
    inspected; anchors without an href are ignored.

    Args:
        query: callable that takes a selector and returns an object with
            find(), such as a PyQuery document.

    Returns:
        The href of the last anchor containing "ro.wikipedia.org", or None
        when there is none.
    """
    wikilink = None
    table = query("div#mw-content-text").find("table")
    for link in table.find("a"):
        address = link.attrib.get('href')
        if address is not None and "ro.wikipedia.org" in address:
            wikilink = address
    return wikilink
def getAuthorLinksList(authorname, query):
    """List the links of an author page that may point to texts.

    Anchors inside "div#mw-content-text" are kept unless they have no href,
    start with "#", or contain one of: "http", "redlink", "Fi%C8%99ier:",
    "index.php", "Autor:".

    Args:
        authorname: not used by the filtering; kept for the callers.
        query: callable that takes a selector and returns an object with
            find(), such as a PyQuery document.

    Returns:
        The kept href values, in page order, duplicates included.
    """
    excluded = ("http", "redlink", "Fi%C8%99ier:", "index.php", "Autor:")
    links = []
    for link in query("div#mw-content-text").find("a"):
        address = link.attrib.get('href')
        if address is None:
            # anchors without a target cannot lead to a text
            continue
        if address.startswith("#") or any(marker in address for marker in excluded):
            continue
        links.append(address)
    return links
def getAuthorBasicInfo(authorname, authorlink):
    """Fetch an author page and return its wiki link and candidate text links.

    Returns a dict with the keys "wiki" (result of getAuthorWikiLink) and
    "links" (result of getAuthorLinksList).
    """
    page = urllib.request.urlopen(BASE_URL + authorlink).read()
    q = PyQuery(page)
    wiki = getAuthorWikiLink(q)
    links = getAuthorLinksList(authorname, q)
    return {"wiki": wiki, "links": links}
# def getAuthorWikiInfo(authorinfo):
# # Nothing can be learned without wiki page
# if authorinfo["wiki"] is None:
# return authorinfo
# try:
# data = urllib.request.urlopen(authorinfo["wiki"]).read()
# q = PyQuery(data)
# # Find the birth date
# body = q("#mw-content-text").text()
# result = re.compile(u"Născut\s+([\w\s]+)").match(body)
# if not result is None:
# authorinfo["birthyear"] = result.group(0)
# except urllib.error.HTTPError:
# pass
# return authorinfo
def getText(url):
    """Fetch a text page and return its (title, content) pair.

    The title is the text of the page's h1 elements; the content is the
    text of "#mw-content-text" after every table inside it was removed.
    """
    page = urllib.request.urlopen(BASE_URL + url).read()
    q = PyQuery(page)

    title = q("h1").text()

    content = q("#mw-content-text")
    # drop tables before extracting the text
    content.find("table").remove()

    return (title, content.text())
def addAuthorToDb(authorinfo):
    # Opens the texts database and issues an INSERT against the Authors table.
    # NOTE(review): the statement has no column list or VALUES clause and
    # `authorinfo` is never used, so this looks unfinished — confirm the
    # intended columns before calling it.
    # NOTE(review): nothing is committed and the connection is not closed here.
    con = sqlite3.connect("data/texts.db")
    c = con.cursor()
    c.execute("INSERT INTO Authors")
def getAllTexts():
    """Download every listed author and their texts into data/texts.db.

    For each author returned by getAuthorList, one row is inserted into
    Authors, then one row per downloadable text into Fragments and
    FragmentsContent. Fragment ids are assigned sequentially starting at 1.
    Texts whose page answers with an HTTP error are skipped. Work is
    committed after each author, and the connection is always closed.
    """
    import urllib.error

    con = sqlite3.connect("data/texts.db")
    try:
        c = con.cursor()
        # NOTE(review): the insert below needs a "wiki" column on Authors
        # (ALTER TABLE Authors ADD COLUMN wiki TEXT) — confirm the database has it.
        fragmentId = 1
        for authorname, authorlink in getAuthorList():
            print("Processing author", authorname)
            authorinfo = getAuthorBasicInfo(authorname, authorlink)
            c.execute("INSERT INTO Authors(name,wiki) VALUES(?, ?)", (authorname, authorinfo["wiki"]))

            for text in authorinfo["links"]:
                # Only the download can raise HTTPError; keep the inserts outside the try
                try:
                    title, content = getText(text)
                except urllib.error.HTTPError:
                    continue
                c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)", (fragmentId, title, authorname))
                c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)", (fragmentId, content))
                fragmentId += 1

            con.commit()
    finally:
        con.close()
# Entry point: there is no __main__ guard, so the download starts whenever
# this module is executed or imported.
getAllTexts()
View File
+62
View File
@@ -0,0 +1,62 @@
'''
Created on May 22, 2016
@author: tibi
'''
from xml.dom import minidom;
from xml.parsers.expat import ExpatError
from model.Word import Word
def parseText(xmlText):
    """Parse a TTL result held in a string and return its words and chunks.

    The XML is expected to hold a root "segs" element containing "seg"
    paragraphs, "s" sentences and "w" words. Sentences are numbered from 1
    across the whole text; words are numbered from 1 within each sentence.
    A word whose "chunk" attribute lists several comma-separated chunks
    produces one Word per chunk.

    Args:
        xmlText: the XML document as a string.

    Returns:
        A tuple (words, chunks): `words` is the list of Word objects in
        document order, `chunks` maps (sentenceIndex, chunk) to the list of
        Word objects in that chunk.

    Raises:
        SystemExit: with code -1 when the XML is not well-formed; the text
            and the parser error are printed first.
    """
    words = []
    chunks = {}
    sentence_i = 0

    # get the root "segs" element
    try:
        dom = minidom.parseString(xmlText)
    except ExpatError as e:
        print("Error in text:", xmlText)
        print(e)
        # Same effect as exit(-1), without relying on the site-provided helper
        raise SystemExit(-1)

    alltext = dom.getElementsByTagName("segs")

    for paragraph in alltext[0].getElementsByTagName("seg"):
        for sentence in paragraph.getElementsByTagName("s"):
            sentence_i += 1
            for word_i, word in enumerate(sentence.getElementsByTagName("w"), 1):
                # A <w> element can be empty (no text node); use an empty
                # string instead of failing on firstChild being None.
                textNode = word.firstChild
                wordText = textNode.data if textNode is not None else ""
                lemma = word.getAttribute("lemma")
                ana = word.getAttribute("ana")
                chunk = word.getAttribute("chunk")

                # one Word per chunk the token belongs to
                for c in chunk.split(","):
                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
                    words.append(w)
                    chunks.setdefault((sentence_i, c), []).append(w)

    return (words, chunks)
+34
View File
@@ -0,0 +1,34 @@
# coding: utf-8
import zeep
def executeTtl(text):
    """Run `text` through the TTL web service and return the result as XML.

    A few accented letters are replaced by plain ones before the call. The
    service answer has its SGML entities replaced by characters and is
    wrapped in an XML declaration plus a root "segs" element.
    """
    # Preprocess the text
    for accented, plain in ((u'ĭ', 'i'), (u'ŭ', 'u'), (u'à', 'a')):
        text = text.replace(accented, plain)

    client = zeep.Client("http://ws.racai.ro/ttlws.wsdl")
    textSgml = client.service.UTF8toSGML(text)
    result = client.service.XCES("ro", "id", textSgml)

    # Cleanup result - generate valid xml
    entities = (
        ('&rsquor;', '`'),
        ('&abreve;', u'ă'),
        ('&agrave;', u'à'),
        ('&acirc;', u'â'),
        ('&icirc;', u'î'),
        ('&scedil;', u'ș'),
        ('&tcedil;', u'ț'),
        ('&ubreve;', u'u'),
        ('&Abreve;', u'Ă'),
        ('&Agrave;', u'À'),
        ('&Acirc;', u'Â'),
        ('&Icirc;', u'Î'),
        ('&Scedil;', u'Ș'),
        ('&Tcedil;', u'Ț'),
        ('&Ubreve;', u'U'),
    )
    for entity, character in entities:
        result = result.replace(entity, character)

    return "<?xml version=\"1.0\" encoding=\"utf-8\" ?><segs>" + result + "</segs>"