Initial commit
Commit 6badfbd103
107 .gitignore (vendored, new file)
@@ -0,0 +1,107 @@
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json

logs/*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
220 .vscode/launch.json (vendored, new file)
@@ -0,0 +1,220 @@
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Run without debugging",
            "type": "python",
            "request": "launch",
            "stopOnEntry": false,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Python",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "PySpark",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "osx": {
                "pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
            },
            "windows": {
                "pythonPath": "${env:SPARK_HOME}/bin/spark-submit.cmd"
            },
            "linux": {
                "pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
            },
            "program": "${file}",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Python Module",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "module": "module.name",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Integrated Terminal/Console",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "cwd": "",
            "console": "integratedTerminal",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit"
            ]
        },
        {
            "name": "External Terminal/Console",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "cwd": "",
            "console": "externalTerminal",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit"
            ]
        },
        {
            "name": "Django",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${workspaceRoot}/manage.py",
            "cwd": "${workspaceRoot}",
            "args": [
                "runserver",
                "--noreload"
            ],
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput",
                "DjangoDebugging"
            ]
        },
        {
            "name": "Flask",
            "type": "python",
            "request": "launch",
            "stopOnEntry": false,
            "pythonPath": "${config:python.pythonPath}",
            "program": "fully qualified path fo 'flask' executable. Generally located along with python interpreter",
            "cwd": "${workspaceRoot}",
            "env": {
                "FLASK_APP": "${workspaceRoot}/quickstart/app.py"
            },
            "args": [
                "run",
                "--no-debugger",
                "--no-reload"
            ],
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Flask (old)",
            "type": "python",
            "request": "launch",
            "stopOnEntry": false,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${workspaceRoot}/run.py",
            "cwd": "${workspaceRoot}",
            "args": [],
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Pyramid",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "args": [
                "${workspaceRoot}/development.ini"
            ],
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput",
                "Pyramid"
            ]
        },
        {
            "name": "Watson",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${workspaceRoot}/console.py",
            "cwd": "${workspaceRoot}",
            "args": [
                "dev",
                "runserver",
                "--noreload=True"
            ],
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Attach (Remote Debug)",
            "type": "python",
            "request": "attach",
            "localRoot": "${workspaceRoot}",
            "remoteRoot": "${workspaceRoot}",
            "port": 3000,
            "secret": "my_secret",
            "host": "localhost"
        }
    ]
}
3 .vscode/settings.json (vendored, new file)
@@ -0,0 +1,3 @@
// Place your settings in this file to overwrite default and user settings.
{
}
17 NLP/.project (new file)
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>NLP</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.python.pydev.PyDevBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.python.pydev.pythonNature</nature>
    </natures>
</projectDescription>
8 NLP/.pydevproject (new file)
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?><pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/${PROJECT_DIR_NAME}/src</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 3.0</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python 3.5</pydev_property>
</pydev_project>
1 NLP/data/text1.txt (new file)
@@ -0,0 +1 @@
Maria are mere. Ea mai are șapte pere. Acestea sunt foarte delicioase.
6 NLP/data/text1_processed.xml (new file)
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Maria" ana="Np" chunk="Np#1">Maria</w><w lemma="avea" ana="Vmip3s" chunk="Vp#1">are</w><w lemma="măr" ana="Ncfp-n" chunk="Np#2">mere</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="el" ana="Pp3fsr--------s">Ea</w><w lemma="mai" ana="Rp" chunk="Ap#1">mai</w><w lemma="avea" ana="Vmip3s" chunk="Vp#1">are</w><w lemma="șapte" ana="Mc-p-l" chunk="Np#1">șapte</w><w lemma="pară" ana="Ncfp-n" chunk="Np#1">pere</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.3"><w lemma="acesta" ana="Pd3fpr">Acestea</w><w lemma="fi" ana="Vmip3p" chunk="Vp#1">sunt</w><w lemma="foarte" ana="Rp" chunk="Ap#1,Vp#1">foarte</w><w lemma="delicios" ana="Afpfp-n" chunk="Ap#1">delicioase</w><c>.</c></s></seg>
</segs>
1 NLP/data/text2.txt (new file)
@@ -0,0 +1 @@
Sabeer Bhatia a ajuns la Aeroportul Internațional din Los Angeles la ora 18 în data de 23 septembrie 1998. Zborul său din Bangalore a durat 22 ore, și el era înfometat.
5 NLP/data/text2_processed.xml (new file)
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Sabeer" ana="Np" chunk="Np#1">Sabeer</w><w lemma="Bhatia" ana="Np" chunk="Np#1">Bhatia</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="ajunge" ana="Vmp--sm" chunk="Vp#1">ajuns</w><w lemma="la" ana="Spsa" chunk="Pp#1">la</w><w lemma="aeroport" ana="Ncmsry" chunk="Pp#1,Np#2">Aeroportul</w><w lemma="internațional" ana="Afpms-n" chunk="Pp#1,Np#2,Ap#1">Internațional</w><w lemma="din" ana="Spsa" chunk="Pp#2">din</w><w lemma="Los" ana="Np" chunk="Pp#2,Np#3">Los</w><w lemma="Angeles" ana="Np" chunk="Pp#2,Np#3">Angeles</w><w lemma="la" ana="Spsa" chunk="Pp#3">la</w><w lemma="oră" ana="Ncfsry" chunk="Pp#3,Np#4">ora</w><w lemma="18" ana="Mc" chunk="Pp#3,Np#4">18</w><w lemma="în" ana="Spsa" chunk="Pp#4">în</w><w lemma="dată" ana="Ncfsry" chunk="Pp#4,Np#5">data</w><w lemma="de" ana="Spsa" chunk="Pp#5">de</w><w lemma="23" ana="Mc" chunk="Pp#5,Np#6">23</w><w lemma="septembrie" ana="Ncms-n" chunk="Pp#5,Np#6">septembrie</w><w lemma="1998" ana="Mc" chunk="Pp#5,Np#6">1998</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="zbor" ana="Ncmsry" chunk="Np#1">Zborul</w><w lemma="său" ana="Ds3ms-s" chunk="Np#1">său</w><w lemma="din" ana="Spsa" chunk="Pp#1">din</w><w lemma="Bangalore" ana="Np" chunk="Pp#1,Np#2">Bangalore</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="dura" ana="Vmp--sm" chunk="Vp#1">durat</w><w lemma="22" ana="Mc" chunk="Np#3">22</w><w lemma="oră" ana="Ncfp-n" chunk="Np#3">ore</w><c>,</c><w lemma="și" ana="Crssp">și</w><w lemma="el" ana="Pp3msr--------s" chunk="Vp#2">el</w><w lemma="fi" ana="Vaii3s" chunk="Vp#2">era</w><w lemma="înfometa" ana="Vmp--sm" chunk="Vp#2">înfometat</w><c>.</c></s></seg>
</segs>
1 NLP/data/text3.txt (new file)
@@ -0,0 +1 @@
Sophia Loren spune că ea va fi întotdeauna mulțumitoare față de Bono. Actrița a dezvăluit că cântărețul trupei U2 a ajutat-o să se liniștească atunci când ea s-a speriat de o furtună în timp ce zburau cu avionul.
5 NLP/data/text3_processed.xml (new file)
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8" ?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Sophia" ana="Np" chunk="Np#1">Sophia</w><w lemma="Loren" ana="Np" chunk="Np#1">Loren</w><w lemma="spune" ana="Vmnp" chunk="Vp#1">spune</w><w lemma="că" ana="Csssp">că</w><w lemma="el" ana="Pp3fsr--------s" chunk="Vp#2">ea</w><w lemma="vrea" ana="Va--3s" chunk="Vp#2">va</w><w lemma="fi" ana="Vmnp" chunk="Vp#2">fi</w><w lemma="întotdeauna" ana="Rgp" chunk="Vp#2,Ap#1">întotdeauna</w><w lemma="mulțumitor" ana="Afpf--n" chunk="Ap#1">mulțumitoare</w><w lemma="față_de" ana="Spca" chunk="Pp#1">față_de</w><w lemma="bonă" ana="Ncfsvy" chunk="Pp#1,Np#2">Bono</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="actriță" ana="Ncfsry" chunk="Np#1">Actrița</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="dezvălui" ana="Vmp--sm" chunk="Vp#1">dezvăluit</w><w lemma="că" ana="Csssp">că</w><w lemma="cântăreț" ana="Ncmsry" chunk="Np#2">cântărețul</w><w lemma="trupă" ana="Ncfsoy" chunk="Np#2">trupei</w><w lemma="U2" ana="Np" chunk="Np#2">U2</w><w lemma="avea" ana="Va--3s" chunk="Vp#2">a</w><w lemma="ajuta" ana="Vmp--sm" chunk="Vp#2">ajutat</w><w lemma="el" ana="Pp3fsa--y-----w">-o</w><w lemma="să" ana="Qs" chunk="Vp#3">să</w><w lemma="sine" ana="Px3--a--------w" chunk="Vp#3">se</w><w lemma="liniști" ana="Vmsp3" chunk="Vp#3">liniștească</w><w lemma="atunci_când" ana="Rw" chunk="Vp#3,Ap#1">atunci_când</w><w lemma="el" ana="Pp3fsr--------s">ea</w><w lemma="sine" ana="Px3--a--y-----w" chunk="Vp#4">s-</w><w lemma="avea" ana="Va--3s" chunk="Vp#4">a</w><w lemma="speria" ana="Vmp--sm" chunk="Vp#4">speriat</w><w lemma="de" ana="Spsa" chunk="Pp#1">de</w><w lemma="un" ana="Tifsr" chunk="Pp#1,Np#3">o</w><w lemma="furtună" ana="Ncfsrn" chunk="Pp#1,Np#3">furtună</w><w lemma="în_timp_ce" ana="Cscsp">în_timp_ce</w><w lemma="zbura" ana="Vmii3p" chunk="Vp#5">zburau</w><w lemma="cu" ana="Spsa" chunk="Pp#2">cu</w><w lemma="avion" ana="Ncmsry" chunk="Pp#2,Np#4">avionul</w><c>.</c></s></seg>
</segs>
121 NLP/src/anaphora.py (new file)
@@ -0,0 +1,121 @@
'''
Created on May 22, 2016

@author: tibi
'''
from model import Word


def getGender(word):

    if word.isPronoun() and (word.pronounGetPerson() == '1' or word.pronounGetPerson() == '2'):
        return 'n'

    return word.getGender()


def genderMatch(word1, word2):

    g1 = getGender(word1)
    g2 = getGender(word2)

    if g1 == g2:
        return 2

    if g1 == 'n' or g2 == 'n':
        return 1

    return 0


def isPrepositional(chunk):

    for word in chunk:

        if word.isPreposition():
            return True

    return False


def countInText(noun, text):

    c = 0
    for word in text:
        if word.text == noun.text:
            c += 1

    return c


def anaphora(text, chunks):

    nounPhrases = []

    for word in text:

        if word.isNoun():
            print("[n]", word)
            nounPhrases.append((word, (word.sentenceIndex, word.chunk)))

        else:
            print(word)

        if word.isPronoun():

            candidates = []

            for noun, chunkIndex in nounPhrases[:-30:-1]:

                # If gender and number match
                if genderMatch(word, noun) > 0 and word.getNumber() == noun.getNumber():

                    npInd = genderMatch(word, noun)

                    # definiteness
                    if not noun.nounIsDefinite():
                        npInd -= 1

                    # non-prepositional noun phrase
                    chunk = chunks[chunkIndex]
                    if (isPrepositional(chunk)):
                        npInd -= 1

                    # first in sentence
                    if noun.sentenceIndex == 1:
                        npInd += 1

                    # indicating verbs
                    # todo...

                    # lexical reiteration
                    c = countInText(noun, text)
                    if c == 2:
                        npInd += 1
                    if c > 2:
                        npInd += 2

                    # noun is representing term
                    # how?

                    # identical collocation pattern to the pronoun
                    # ???

                    # immediate reference, resolving 'it'
                    # applicable?

                    # referential distance
                    dist = word.sentenceIndex - noun.sentenceIndex
                    if dist == 0:
                        npInd += 2
                    elif dist == 1:
                        npInd += 1

                    candidates.append((noun, npInd))
                    print("...> Candidate: {0} npInd = {1}".format(noun, npInd))

            if len(candidates) > 0:

                pickedWord, pickedInd = candidates[0]
                for word, npInd in candidates:
                    if npInd > pickedInd:
                        pickedInd = npInd
                        pickedWord = word

                print(".>>> Picked: {0}".format(pickedWord))
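NLP/src/main.py below drives this scoring pass end to end; a minimal sketch of the same call pattern (illustrative only, not an additional committed file):

# Illustrative sketch, not part of the commit: parse a processed TTL/XCES
# file and run the anaphora scoring over the resulting words and chunks.
import fileparser
from anaphora import anaphora

words, chunks = fileparser.parse("../data/text1_processed.xml")
anaphora(words, chunks)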
55 NLP/src/fileparser.py (new file)
@@ -0,0 +1,55 @@
'''
Created on May 22, 2016

@author: tibi
'''

from xml.dom import minidom;
from model.Word import Word


def parse(filename):

    words = []
    chunks = {}

    sentence_i = 0

    # get the root "segs" element
    dom = minidom.parse(filename)
    alltext = dom.getElementsByTagName("segs")

    # iterate paragraphs
    for paragraph in alltext[0].getElementsByTagName("seg"):

        # iterate sentences
        for sentence in paragraph.getElementsByTagName("s"):

            # increment sentence index
            sentence_i += 1
            word_i = 0

            # iterate words
            for word in sentence.getElementsByTagName("w"):

                # increment word index
                word_i += 1

                # obtain word info
                wordText = word.firstChild.data
                lemma = word.getAttribute("lemma")
                ana = word.getAttribute("ana")
                chunk = word.getAttribute("chunk")

                # create word
                #w = Word(wordText, lemma, ana, chunk, sentence_i, word_i)
                #words.append(w)

                for c in chunk.split(","):
                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
                    words.append(w)
                    if chunks.get((sentence_i, c)) == None:
                        chunks[(sentence_i, c)] = [ w ]
                    else:
                        chunks[(sentence_i, c)].append(w)

    return (words, chunks)
26 NLP/src/main.py (new file)
@@ -0,0 +1,26 @@
'''
Created on May 22, 2016

@author: tibi
'''
import fileparser
from anaphora import anaphora

if __name__ == '__main__':

    words, chunks = fileparser.parse("../data/text3_processed.xml")

    print("Words:")
    for word in words:
        print("[{0} {1}] {2}".format(word.sentenceIndex, word.wordIndex, word))
    print("")

    print("Chunks:")
    for key, value in chunks.items():
        print(key, ":")
        for word in value:
            print(" - ", word)
    print("")

    print("Anaphora resolution:")
    anaphora(words, chunks)
88 NLP/src/model/Word.py (new file)
@@ -0,0 +1,88 @@
'''
Created on May 22, 2016

@author: tibi
'''


class Word:

    text = ""
    lemma = ""
    ana = ""
    chunk = ""

    sentenceIndex = 0
    wordIndex = 0

    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
        self.text = text
        self.lemma = lemma
        self.ana = ana
        self.chunk = chunk
        self.sentenceIndex = sentenceIndex
        self.wordIndex = wordIndex

    def __str__(self):
        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)

    def isNoun(self):
        return self.ana[0] == "N"

    def nounIsCommon(self):
        return self.isNoun() and self.ana[1] == "c"

    def nounIsProper(self):
        return self.isNoun() and self.ana[1] == "p"

    def nounGetCase(self):

        if self.isNoun():
            return self.ana[4]

        return None

    'Este articulat?'
    def nounIsDefinite(self):
        if self.isNoun():
            if (self.nounIsProper()):
                return True

            if len(self.ana) > 5:
                return self.ana[5]

        return "n"

    def pronounGetPerson(self):
        if self.isPronoun():
            return self.ana[2]

    def getGender(self):
        if self.isNoun():
            if (len(self.ana) >= 3):
                return self.ana[2]
            return 'n'

        if self.isPronoun():
            return self.ana[3]

        return None

    def getNumber(self):
        if self.isNoun():
            if self.nounIsProper():
                return 's'
            else:
                return self.ana[3]
        if self.isPronoun():
            return self.ana[4]

        return None

    def isPronoun(self):
        return self.ana[0] == "P"

    def isVerb(self):
        return self.ana[0] == "V"

    def isPreposition(self):
        return self.ana[0] == "S" and self.ana[1] == "p"
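The positional indexing above assumes MULTEXT-East style morphosyntactic tags like those in the sample XML files; a minimal sketch (illustrative only, not part of the committed files) of how the accessors decode one tag taken from NLP/data/text2_processed.xml:

# "Ncfsry" = Noun, common, feminine, singular, direct case, definite.
from model.Word import Word

w = Word("ora", "oră", "Ncfsry", "Np#4", 1, 12)
print(w.isNoun())          # True -> ana[0] == "N"
print(w.nounIsCommon())    # True -> ana[1] == "c"
print(w.getGender())       # "f"  -> ana[2]
print(w.getNumber())       # "s"  -> ana[3]
print(w.nounGetCase())     # "r"  -> ana[4]
print(w.nounIsDefinite())  # "y"  -> ana[5]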
0 NLP/src/model/__init__.py (new file, empty)
1 data/index.csv (new file)
@@ -0,0 +1 @@
books/Moara cu noroc - Ioan Slavici.epub;Moara cu noroc;1880;Ioan Slavici;1848;Transilvania
BIN data/results.db (new file, binary file not shown)
BIN data/texts.db (new file, binary file not shown)
BIN data/texts.db.bak (new file, binary file not shown)
BIN data/texts2.db (new file, binary file not shown)
20 src/logger.py (new file)
@@ -0,0 +1,20 @@
import time
import logging

def init_logger(level):
    # Log filename
    tm = time.strftime('%Y-%m-%d_%H-%M-%S')
    logFile = "logs/log_{0}.log".format(tm)

    # Set up file logger
    logging.basicConfig(filename=logFile,
                        level=logging.DEBUG,
                        format='%(asctime)s %(name)s %(levelname)s %(message)s',
                        datefmt='%m-%d %H:%M')

    # Set up console logger
    formatter = logging.Formatter('[%(name)s] %(levelname)s: %(message)s')
    console = logging.StreamHandler()
    console.setLevel(level)
    console.setFormatter(formatter)
    logging.getLogger().addHandler(console)
40 src/main.py (new file)
@@ -0,0 +1,40 @@
import logging
import time
# own
import logger
import storage.data
import storage.results
import textprocessor.letterfreq
import ttl.ttlparser
import ttl.ttlservice

def init():
    logger.init_logger(logging.WARNING)
    storage.data.initializeFragmentDatabase("data/texts.db")
    storage.results.initializeResultsDatabase("data/results.db", True)

def processTexts():
    count = storage.data.getTextCount()
    current = 0
    for item in storage.data.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # obtain ttl analysis
        # unfeasable - it takes 5-10 minutes for a single text
        # ttlResult = ttl.ttlservice.executeTtl(itemtext)
        # (words, chunks) = ttl.ttlparser.parseText(ttlResult)
        # storage.results.storeTtlAnalysis(itemid, words)

        # perform analysis
        letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext)
        storage.results.storeFrequencies(itemid, letterFreq)

    print("Finished!")

init()
processTexts()
32 src/model.py (new file)
@@ -0,0 +1,32 @@
# Defines a fragment author
class Author:
    def __init__(self, name = "", birthYear = "", location = "Romania"):
        self.name = name
        self.yearOfBirth = birthYear
        self.location = location

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def dump(self):
        return "[Author name={0} yearOfBirth={1} location={2}]".format(self.name, self.yearOfBirth, self.location)

# Defines a text fragment
class Fragment:
    def __init__(self, title = "", text = "", author = Author(), year = 1999):
        self.title = title
        self.text = text
        self.author = author
        self.year = year

    def __str__(self):
        return self.title

    def __repr__(self):
        return self.title

    def dump(self):
        return "[Fragment title={0} author={1} year={2} text={3}]".format(self.title, self.author.dump(), self.year, self.text)
87 src/model/Word.py (new file)
@@ -0,0 +1,87 @@

# Defines a processed word
class Word:

    text = ""
    lemma = ""
    ana = ""
    chunk = ""

    sentenceIndex = 0
    wordIndex = 0

    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
        self.text = text
        self.lemma = lemma
        self.ana = ana
        self.chunk = chunk
        self.sentenceIndex = sentenceIndex
        self.wordIndex = wordIndex

    def __str__(self):
        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)

    def __repr__(self):
        return str(self)

    def isNoun(self):
        return self.ana[0] == "N"

    def nounIsCommon(self):
        return self.isNoun() and self.ana[1] == "c"

    def nounIsProper(self):
        return self.isNoun() and self.ana[1] == "p"

    def nounGetCase(self):

        if self.isNoun():
            return self.ana[4]

        return None

    'Este articulat?'
    def nounIsDefinite(self):
        if self.isNoun():
            if (self.nounIsProper()):
                return True

            if len(self.ana) > 5:
                return self.ana[5]

        return "n"

    def pronounGetPerson(self):
        if self.isPronoun():
            return self.ana[2]

    def getGender(self):
        if self.isNoun():
            if (len(self.ana) >= 3):
                return self.ana[2]
            return 'n'

        if self.isPronoun():
            return self.ana[3]

        return None

    def getNumber(self):
        if self.isNoun():
            if self.nounIsProper():
                return 's'
            else:
                return self.ana[3]
        if self.isPronoun():
            return self.ana[4]

        return None

    def isPronoun(self):
        return self.ana[0] == "P"

    def isVerb(self):
        return self.ana[0] == "V"

    def isPreposition(self):
        return self.ana[0] == "S" and self.ana[1] == "p"
0 src/model/__init__.py (new file, empty)
0 src/storage/__init__.py (new file, empty)
80 src/storage/data.py (new file)
@@ -0,0 +1,80 @@
import logging
import os
from model import *
import sqlite3

log = logging.getLogger("storage")

DB_FRAGMENTS = ""

# Commands

# birth location - general area, not exact location (i.e. Transylvania)
# birth origin - rural or urban
# studies - masters, bachelors, high school, middle school, primary school
# occupation - comma separated if there are multiple
# studiesAbroad - foreign cities where author studied (comma separated)
COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
    name TEXT PRIMARY KEY,
    birthYear INTEGER,
    birthLocation TEXT,
    birthOrigin TEXT,
    studies TEXT,
    occupations TEXT,
    studiesAbroad TEXT
)"""

# genre - short story (nuvela), novel (roman), poem etc
# movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
# tags - other relevant information (i.e. psychological)
COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
    id INTEGER PRIMARY KEY,
    title TEXT,
    year INTEGER,
    author TEXT REFERENCES Authors(name),
    genre TEXT,
    movement TEXT,
    tags TEXT
)"""

# contains the actual text
COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
    id INTEGER REFERENCES Fragments(id),
    content TEXT
)"""

# Initialize databases
def initializeFragmentDatabase(dbFile):
    global DB_FRAGMENTS
    DB_FRAGMENTS = dbFile

    if not os.path.exists(dbFile):
        log.info("Text database %s not found. Will create database.", dbFile)
        con = sqlite3.connect(dbFile)
        c = con.cursor()
        c.execute(COMMAND_CREATE_AUTHORS)
        c.execute(COMMAND_CREATE_FRAGMENTS)
        c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT)
        con.commit()
        con.close()
        log.info("Database created!")

def getTextCount():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT COUNT(*) FROM Fragments")
    item = c.fetchone()
    c.close()
    con.close()
    return item[0]

def getAllTexts():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT id, content FROM FragmentsContent")

    items = c.fetchall()

    c.close()
    con.close()
    return items
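These tables are populated by src/tools/wikisource_downloader.py further down; a minimal sketch of an insert consistent with the schema (the addFragment helper is hypothetical, not part of the commit):

import sqlite3

def addFragment(dbFile, fragId, title, author, content):
    # Hypothetical helper: store one fragment and its text under the
    # Fragments / FragmentsContent tables created above.
    con = sqlite3.connect(dbFile)
    c = con.cursor()
    c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)",
              (fragId, title, author))
    c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)",
              (fragId, content))
    con.commit()
    con.close()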
84 src/storage/results.py (new file)
@@ -0,0 +1,84 @@
import logging
import os
from model.Word import *
import sqlite3

log = logging.getLogger("storage")

DB_RESULTS = ""

COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies (
    idtext INTEGER,
    lettergroup TEXT,
    category TEXT,
    frequency REAL
)"""

COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords (
    idtext INTEGER,
    wordIndex INTEGER,
    sentenceIndex INTEGER,
    word TEXT,
    lemma TEXT,
    analysis TEXT,
    chunk TEXT
)"""

# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
#     idtext INTEGER,
#     wordlength INTEGER,
#     frequency REAL
# )"""

def initializeResultsDatabase(dbFile, cleanupOldData):
    global DB_RESULTS
    DB_RESULTS = dbFile

    # cleanup old data
    if cleanupOldData:
        con = sqlite3.connect(DB_RESULTS)
        c = con.cursor()

        try:
            c.execute("DROP TABLE LetterFrequencies")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_LETTER_FREQUENCIES)

        try:
            c.execute("DROP TABLE TextWords")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_TEXT_WORDS)

        con.commit()
        c.close()
        con.close()


def storeFrequencies(idtext, freq):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # add data
    chr = ['p', 'l1', 'l2', 'l3']
    for i in range(4):
        for let, fr in freq[i]:
            c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, chr[i], fr))

    con.commit()
    c.close()
    con.close()

def storeTtlAnalysis(idtext, words):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # store words
    for word in words:
        c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk))

    # finish
    con.commit()
    c.close()
    con.close()
14 src/test.py (new file)
@@ -0,0 +1,14 @@
# coding: utf-8
from ttl import ttlservice
from ttl import ttlparser
import nltk

import storage

data = storage.parseIndex("data")
print(data)

#textXml = ttlservice.executeTtl(u"Numele meu este Tibi și îmi place să cânt la chitară bass. Ce faci?")
#words, chunks = ttlparser.parseText(textXml)
#print ("Words: ", words)
#print ("Chunks: ", chunks)
0 src/textprocessor/__init__.py (new file, empty)
38 src/textprocessor/letterfreq.py (new file)
@@ -0,0 +1,38 @@
import operator
import storage

def letterFrequencies(text):
    letterfreq = [{}, {}, {}, {}]
    lettersum = [0, 0, 0, 0]

    n = len(text)
    for i in range(n):

        # compute substring frequency
        # l = substring length
        for l in range(1, 4):
            sub = text[i : i + l].lower()
            if len(sub) == l and sub.isalnum():
                lettersum[l] += 1
                if not sub in letterfreq[l]:
                    letterfreq[l][sub] = 1
                else:
                    letterfreq[l][sub] += 1

        # compute punctuation frequency
        chr = text[i]
        if not chr.isalnum() and not chr.isspace() and chr.isprintable():
            lettersum[0] += 1
            if not chr in letterfreq[0]:
                letterfreq[0][chr] = 1
            else:
                letterfreq[0][chr] += 1

    # Almost done. Sort and remove irrelevant items (with low frequency), and normalize data
    for i in range(4):
        freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True)
        freqFiltered = freqSorted[0:50]
        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqFiltered]
        letterfreq[i] = freqNormalized

    return letterfreq
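The return value is a list of four lists of (symbol, relative frequency) pairs: index 0 holds punctuation, indices 1 to 3 hold 1- to 3-character alphanumeric substrings, each sorted by frequency and truncated to the top 50. A quick sketch (illustrative only, not part of the commit):

# Illustrative only, not part of the commit.
from textprocessor.letterfreq import letterFrequencies

freq = letterFrequencies("Maria are mere.")
# freq[0] -> punctuation, here [('.', 1.0)]
# freq[1] -> single letters, e.g. ('a', 0.25), ('r', 0.25), ('e', 0.25), ...
# freq[2] -> two-character substrings such as 'ar' and 're'
# freq[3] -> three-character substrings such as 'mar' and 'ari'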
2 src/textprocessor/wordanalysis.py (new file)
@@ -0,0 +1,2 @@
def analyzeWords(text):
    pass
128 src/tools/wikisource_downloader.py (new file)
@@ -0,0 +1,128 @@
import urllib.request
import urllib.error
from pyquery import PyQuery
import sqlite3
import re

BASE_URL = "https://ro.wikisource.org"
LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

def getAuthorList():
    authors = []
    for letter in LETTERS:
        print("Processing link page for letter", letter)
        # Read index page
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        data = urllib.request.urlopen(url).read()
        q = PyQuery(data)
        for item in q("div.mw-category-generated").find("a"):
            if (item.text.startswith("Autor:")):
                authorname = item.text[6:]
                authorlink = item.attrib['href']
                authors.append((authorname, authorlink))
    return list(set(authors))

def getAuthorWikiLink(query):
    wikilink = None
    body = query("div#mw-content-text")
    table = body.find("table")
    for link in table.find("a"):
        if "ro.wikipedia.org" in link.attrib['href']:
            wikilink = link.attrib['href']
    return wikilink

def getAuthorLinksList(authorname, query):
    links = []
    body = query("div#mw-content-text")
    for link in body.find("a"):
        address = link.attrib['href']
        ok = True
        if "http" in address:
            ok = False
        if "redlink" in address:
            ok = False
        if "Fi%C8%99ier:" in address:
            ok = False
        if "index.php" in address:
            ok = False
        if address.startswith("#"):
            ok = False
        if "Autor:" in address:
            ok = False
        if ok:
            links.append(link.attrib['href'])
    return links

def getAuthorBasicInfo(authorname, authorlink):
    info = {}
    data = urllib.request.urlopen(BASE_URL + authorlink).read()
    q = PyQuery(data)

    info["wiki"] = getAuthorWikiLink(q)
    info["links"] = getAuthorLinksList(authorname, q)

    return info

# def getAuthorWikiInfo(authorinfo):

#     # Nothing can be learned without wiki page
#     if authorinfo["wiki"] is None:
#         return authorinfo

#     try:
#         data = urllib.request.urlopen(authorinfo["wiki"]).read()
#         q = PyQuery(data)

#         # Find the birth date
#         body = q("#mw-content-text").text()
#         result = re.compile(u"Născut\s+([\w\s]+)").match(body)
#         if not result is None:
#             authorinfo["birthyear"] = result.group(0)

#     except urllib.error.HTTPError:
#         pass

#     return authorinfo

def getText(url):
    data = urllib.request.urlopen(BASE_URL + url).read()
    q = PyQuery(data)

    texttitle = q("h1").text()

    body = q("#mw-content-text")
    body.find("table").remove()

    textcontent = body.text()
    return (texttitle, textcontent)

def addAuthorToDb(authorinfo):
    con = sqlite3.connect("data/texts.db")
    c = con.cursor()
    c.execute("INSERT INTO Authors")

def getAllTexts():

    con = sqlite3.connect("data/texts.db")
    c = con.cursor()
    #c.execute("ALTER TABLE Authors ADD COLUMN wiki TEXT")
    id = 1

    authors = getAuthorList()
    for authorname, authorlink in authors:
        print("Processing author", authorname)
        authorinfo = getAuthorBasicInfo(authorname, authorlink)
        c.execute("INSERT INTO Authors(name,wiki) VALUES(?, ?)", (authorname, authorinfo["wiki"]))

        # authorinfo = getAuthorWikiInfo(authorinfo)
        for text in authorinfo["links"]:
            try:
                title, content = getText(text)
                c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)", (id, title, authorname))
                c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)", (id, content))
                id = id + 1
            except urllib.error.HTTPError:
                continue

    con.commit()

getAllTexts()
0 src/ttl/__init__.py (new file, empty)
62 src/ttl/ttlparser.py (new file)
@@ -0,0 +1,62 @@
'''
Created on May 22, 2016

@author: tibi
'''

from xml.dom import minidom;
from xml.parsers.expat import ExpatError
from model.Word import Word

def parseText(xmlText):

    words = []
    chunks = {}

    sentence_i = 0

    # get the root "segs" element
    try:
        dom = minidom.parseString(xmlText)
    except ExpatError as e:
        print("Error in text:", xmlText)
        print(e)
        exit(-1)

    alltext = dom.getElementsByTagName("segs")

    # iterate paragraphs
    for paragraph in alltext[0].getElementsByTagName("seg"):

        # iterate sentences
        for sentence in paragraph.getElementsByTagName("s"):

            # increment sentence index
            sentence_i += 1
            word_i = 0

            # iterate words
            for word in sentence.getElementsByTagName("w"):

                # increment word index
                word_i += 1

                # obtain word info
                wordText = word.firstChild.data
                lemma = word.getAttribute("lemma")
                ana = word.getAttribute("ana")
                chunk = word.getAttribute("chunk")

                # create word
                #w = Word(wordText, lemma, ana, chunk, sentence_i, word_i)
                #words.append(w)

                for c in chunk.split(","):
                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
                    words.append(w)
                    if chunks.get((sentence_i, c)) == None:
                        chunks[(sentence_i, c)] = [ w ]
                    else:
                        chunks[(sentence_i, c)].append(w)

    return (words, chunks)
34 src/ttl/ttlservice.py (new file)
@@ -0,0 +1,34 @@
# coding: utf-8
import zeep

def executeTtl(text):
    # Preprocess the text
    text = text.replace(u'ĭ', 'i')
    text = text.replace(u'ŭ', 'u')
    text = text.replace(u'à', 'a')

    client = zeep.Client("http://ws.racai.ro/ttlws.wsdl")
    textSgml = client.service.UTF8toSGML(text)
    result = client.service.XCES("ro", "id", textSgml)

    # Cleanup result - generate valid xml
    result = result.replace('’', '`')
    result = result.replace('ă', u'ă')
    result = result.replace('à', u'à')
    result = result.replace('â', u'â')
    result = result.replace('î', u'î')
    result = result.replace('ş', u'ș')
    result = result.replace('ţ', u'ț')
    result = result.replace('ŭ', u'u')
    result = result.replace('Ă', u'Ă')
    result = result.replace('À', u'À')
    result = result.replace('Â', u'Â')
    result = result.replace('Î', u'Î')
    result = result.replace('Ş', u'Ș')
    result = result.replace('Ţ', u'Ț')
    result = result.replace('Ŭ', u'U')

    xmlResult = "<?xml version=\"1.0\" encoding=\"utf-8\" ?><segs>"
    xmlResult += result
    xmlResult += "</segs>"
    return xmlResult
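The intended pipeline, kept commented out in src/main.py and src/test.py because a remote TTL call can take minutes per text, chains this service with ttlparser; a minimal sketch, assuming the ws.racai.ro endpoint is reachable:

# Illustrative sketch, not part of the commit: annotate a short text with
# the remote TTL service and parse the returned XCES XML.
from ttl import ttlservice, ttlparser

xml = ttlservice.executeTtl(u"Maria are mere. Ea mai are șapte pere.")
words, chunks = ttlparser.parseText(xml)
for word in words:
    print(word.sentenceIndex, word.wordIndex, word)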