Initial commit

2017-05-23 13:57:53 +03:00
commit 6badfbd103
38 changed files with 1286 additions and 0 deletions
@@ -0,0 +1,107 @@
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+
+logs/*
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
@@ -0,0 +1,220 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Run without debugging",
+            "type": "python",
+            "request": "launch",
+            "stopOnEntry": false,
+            "pythonPath": "${config:python.pythonPath}",
+            "program": "${file}",
+            "cwd": "${workspaceRoot}",
+            "env": {},
+            "envFile": "${workspaceRoot}/.env",
+            "debugOptions": [
+                "WaitOnAbnormalExit",
+                "WaitOnNormalExit",
+                "RedirectOutput"
+            ]
+        },
+        {
+            "name": "Python",
+            "type": "python",
+            "request": "launch",
+            "stopOnEntry": true,
+            "pythonPath": "${config:python.pythonPath}",
+            "program": "${file}",
+            "cwd": "${workspaceRoot}",
+            "env": {},
+            "envFile": "${workspaceRoot}/.env",
+            "debugOptions": [
+                "WaitOnAbnormalExit",
+                "WaitOnNormalExit",
+                "RedirectOutput"
+            ]
+        },
+        {
+            "name": "PySpark",
+            "type": "python",
+            "request": "launch",
+            "stopOnEntry": true,
+            "osx": {
+                "pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
+            },
+            "windows": {
+                "pythonPath": "${env:SPARK_HOME}/bin/spark-submit.cmd"
+            },
+            "linux": {
+                "pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
+            },
+            "program": "${file}",
+            "cwd": "${workspaceRoot}",
+            "env": {},
+            "envFile": "${workspaceRoot}/.env",
+            "debugOptions": [
+                "WaitOnAbnormalExit",
+                "WaitOnNormalExit",
+                "RedirectOutput"
+            ]
+        },
+        {
+            "name": "Python Module",
+            "type": "python",
+            "request": "launch",
+            "stopOnEntry": true,
+            "pythonPath": "${config:python.pythonPath}",
+            "module": "module.name",
+            "cwd": "${workspaceRoot}",
+            "env": {},
+            "envFile": "${workspaceRoot}/.env",
+            "debugOptions": [
+                "WaitOnAbnormalExit",
+                "WaitOnNormalExit",
+                "RedirectOutput"
+            ]
+        },
+        {
+            "name": "Integrated Terminal/Console",
+            "type": "python",
+            "request": "launch",
+            "stopOnEntry": true,
+            "pythonPath": "${config:python.pythonPath}",
+            "program": "${file}",
+            "cwd": "",
+            "console": "integratedTerminal",
+            "env": {},
+            "envFile": "${workspaceRoot}/.env",
+            "debugOptions": [
+                "WaitOnAbnormalExit",
+                "WaitOnNormalExit"
+            ]
+        },
+        {
+            "name": "External Terminal/Console",
+            "type": "python",
+            "request": "launch",
+            "stopOnEntry": true,
+            "pythonPath": "${config:python.pythonPath}",
+            "program": "${file}",
+            "cwd": "",
+            "console": "externalTerminal",
+            "env": {},
+            "envFile": "${workspaceRoot}/.env",
+            "debugOptions": [
+                "WaitOnAbnormalExit",
+                "WaitOnNormalExit"
+            ]
+        },
+        {
+            "name": "Django",
+            "type": "python",
+            "request": "launch",
+            "stopOnEntry": true,
+            "pythonPath": "${config:python.pythonPath}",
+            "program": "${workspaceRoot}/manage.py",
+            "cwd": "${workspaceRoot}",
+            "args": [
+                "runserver",
+                "--noreload"
+            ],
+            "env": {},
+            "envFile": "${workspaceRoot}/.env",
+            "debugOptions": [
+                "WaitOnAbnormalExit",
+                "WaitOnNormalExit",
+                "RedirectOutput",
+                "DjangoDebugging"
+            ]
+        },
+        {
+            "name": "Flask",
+            "type": "python",
+            "request": "launch",
+            "stopOnEntry": false,
+            "pythonPath": "${config:python.pythonPath}",
+            "program": "fully qualified path to 'flask' executable. Generally located along with python interpreter",
+            "cwd": "${workspaceRoot}",
+            "env": {
+                "FLASK_APP": "${workspaceRoot}/quickstart/app.py"
+            },
+            "args": [
+                "run",
+                "--no-debugger",
+                "--no-reload"
+            ],
+            "envFile": "${workspaceRoot}/.env",
+            "debugOptions": [
+                "WaitOnAbnormalExit",
+                "WaitOnNormalExit",
+                "RedirectOutput"
+            ]
+        },
+        {
+            "name": "Flask (old)",
+            "type": "python",
+            "request": "launch",
+            "stopOnEntry": false,
+            "pythonPath": "${config:python.pythonPath}",
+            "program": "${workspaceRoot}/run.py",
+            "cwd": "${workspaceRoot}",
+            "args": [],
+            "env": {},
+            "envFile": "${workspaceRoot}/.env",
+            "debugOptions": [
+                "WaitOnAbnormalExit",
+                "WaitOnNormalExit",
+                "RedirectOutput"
+            ]
+        },
+        {
+            "name": "Pyramid",
+            "type": "python",
+            "request": "launch",
+            "stopOnEntry": true,
+            "pythonPath": "${config:python.pythonPath}",
+            "cwd": "${workspaceRoot}",
+            "env": {},
+            "envFile": "${workspaceRoot}/.env",
+            "args": [
+                "${workspaceRoot}/development.ini"
+            ],
+            "debugOptions": [
+                "WaitOnAbnormalExit",
+                "WaitOnNormalExit",
+                "RedirectOutput",
+                "Pyramid"
+            ]
+        },
+        {
+            "name": "Watson",
+            "type": "python",
+            "request": "launch",
+            "stopOnEntry": true,
+            "pythonPath": "${config:python.pythonPath}",
+            "program": "${workspaceRoot}/console.py",
+            "cwd": "${workspaceRoot}",
+            "args": [
+                "dev",
+                "runserver",
+                "--noreload=True"
+            ],
+            "env": {},
+            "envFile": "${workspaceRoot}/.env",
+            "debugOptions": [
+                "WaitOnAbnormalExit",
+                "WaitOnNormalExit",
+                "RedirectOutput"
+            ]
+        },
+        {
+            "name": "Attach (Remote Debug)",
+            "type": "python",
+            "request": "attach",
+            "localRoot": "${workspaceRoot}",
+            "remoteRoot": "${workspaceRoot}",
+            "port": 3000,
+            "secret": "my_secret",
+            "host": "localhost"
+        }
+    ]
+}
@@ -0,0 +1,3 @@
+// Place your settings in this file to overwrite default and user settings.
+{
+}
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>NLP</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.python.pydev.PyDevBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.python.pydev.pythonNature</nature>
+	</natures>
+</projectDescription>
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?eclipse-pydev version="1.0"?><pydev_project>
+<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
+<path>/${PROJECT_DIR_NAME}/src</path>
+</pydev_pathproperty>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 3.0</pydev_property>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python 3.5</pydev_property>
+</pydev_project>
@@ -0,0 +1 @@
+Maria are mere. Ea mai are șapte pere. Acestea sunt foarte delicioase.
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<segs>
+<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Maria" ana="Np" chunk="Np#1">Maria</w><w lemma="avea" ana="Vmip3s" chunk="Vp#1">are</w><w lemma="măr" ana="Ncfp-n" chunk="Np#2">mere</w><c>.</c></s></seg>
+<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="el" ana="Pp3fsr--------s">Ea</w><w lemma="mai" ana="Rp" chunk="Ap#1">mai</w><w lemma="avea" ana="Vmip3s" chunk="Vp#1">are</w><w lemma="șapte" ana="Mc-p-l" chunk="Np#1">șapte</w><w lemma="pară" ana="Ncfp-n" chunk="Np#1">pere</w><c>.</c></s></seg>
+<seg lang="ro"><s id="id_temp_aiurea.3"><w lemma="acesta" ana="Pd3fpr">Acestea</w><w lemma="fi" ana="Vmip3p" chunk="Vp#1">sunt</w><w lemma="foarte" ana="Rp" chunk="Ap#1,Vp#1">foarte</w><w lemma="delicios" ana="Afpfp-n" chunk="Ap#1">delicioase</w><c>.</c></s></seg>
+</segs>
@@ -0,0 +1 @@
+Sabeer Bhatia a ajuns la Aeroportul Internațional din Los Angeles la ora 18 în data de 23 septembrie 1998. Zborul său din Bangalore a durat 22 ore, și el era înfometat.
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<segs>
+<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Sabeer" ana="Np" chunk="Np#1">Sabeer</w><w lemma="Bhatia" ana="Np" chunk="Np#1">Bhatia</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="ajunge" ana="Vmp--sm" chunk="Vp#1">ajuns</w><w lemma="la" ana="Spsa" chunk="Pp#1">la</w><w lemma="aeroport" ana="Ncmsry" chunk="Pp#1,Np#2">Aeroportul</w><w lemma="internațional" ana="Afpms-n" chunk="Pp#1,Np#2,Ap#1">Internațional</w><w lemma="din" ana="Spsa" chunk="Pp#2">din</w><w lemma="Los" ana="Np" chunk="Pp#2,Np#3">Los</w><w lemma="Angeles" ana="Np" chunk="Pp#2,Np#3">Angeles</w><w lemma="la" ana="Spsa" chunk="Pp#3">la</w><w lemma="oră" ana="Ncfsry" chunk="Pp#3,Np#4">ora</w><w lemma="18" ana="Mc" chunk="Pp#3,Np#4">18</w><w lemma="în" ana="Spsa" chunk="Pp#4">în</w><w lemma="dată" ana="Ncfsry" chunk="Pp#4,Np#5">data</w><w lemma="de" ana="Spsa" chunk="Pp#5">de</w><w lemma="23" ana="Mc" chunk="Pp#5,Np#6">23</w><w lemma="septembrie" ana="Ncms-n" chunk="Pp#5,Np#6">septembrie</w><w lemma="1998" ana="Mc" chunk="Pp#5,Np#6">1998</w><c>.</c></s></seg>
+<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="zbor" ana="Ncmsry" chunk="Np#1">Zborul</w><w lemma="său" ana="Ds3ms-s" chunk="Np#1">său</w><w lemma="din" ana="Spsa" chunk="Pp#1">din</w><w lemma="Bangalore" ana="Np" chunk="Pp#1,Np#2">Bangalore</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="dura" ana="Vmp--sm" chunk="Vp#1">durat</w><w lemma="22" ana="Mc" chunk="Np#3">22</w><w lemma="oră" ana="Ncfp-n" chunk="Np#3">ore</w><c>,</c><w lemma="și" ana="Crssp">și</w><w lemma="el" ana="Pp3msr--------s" chunk="Vp#2">el</w><w lemma="fi" ana="Vaii3s" chunk="Vp#2">era</w><w lemma="înfometa" ana="Vmp--sm" chunk="Vp#2">înfometat</w><c>.</c></s></seg>
+</segs>
@@ -0,0 +1 @@
+Sophia Loren spune că ea va fi întotdeauna mulțumitoare față de Bono. Actrița a dezvăluit că cântărețul trupei U2 a ajutat-o să se liniștească atunci când ea s-a speriat de o furtună în timp ce zburau cu avionul.
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<segs>
+<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Sophia" ana="Np" chunk="Np#1">Sophia</w><w lemma="Loren" ana="Np" chunk="Np#1">Loren</w><w lemma="spune" ana="Vmnp" chunk="Vp#1">spune</w><w lemma="că" ana="Csssp">că</w><w lemma="el" ana="Pp3fsr--------s" chunk="Vp#2">ea</w><w lemma="vrea" ana="Va--3s" chunk="Vp#2">va</w><w lemma="fi" ana="Vmnp" chunk="Vp#2">fi</w><w lemma="întotdeauna" ana="Rgp" chunk="Vp#2,Ap#1">întotdeauna</w><w lemma="mulțumitor" ana="Afpf--n" chunk="Ap#1">mulțumitoare</w><w lemma="față_de" ana="Spca" chunk="Pp#1">față_de</w><w lemma="bonă" ana="Ncfsvy" chunk="Pp#1,Np#2">Bono</w><c>.</c></s></seg>
+<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="actriță" ana="Ncfsry" chunk="Np#1">Actrița</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="dezvălui" ana="Vmp--sm" chunk="Vp#1">dezvăluit</w><w lemma="că" ana="Csssp">că</w><w lemma="cântăreț" ana="Ncmsry" chunk="Np#2">cântărețul</w><w lemma="trupă" ana="Ncfsoy" chunk="Np#2">trupei</w><w lemma="U2" ana="Np" chunk="Np#2">U2</w><w lemma="avea" ana="Va--3s" chunk="Vp#2">a</w><w lemma="ajuta" ana="Vmp--sm" chunk="Vp#2">ajutat</w><w lemma="el" ana="Pp3fsa--y-----w">-o</w><w lemma="să" ana="Qs" chunk="Vp#3">să</w><w lemma="sine" ana="Px3--a--------w" chunk="Vp#3">se</w><w lemma="liniști" ana="Vmsp3" chunk="Vp#3">liniștească</w><w lemma="atunci_când" ana="Rw" chunk="Vp#3,Ap#1">atunci_când</w><w lemma="el" ana="Pp3fsr--------s">ea</w><w lemma="sine" ana="Px3--a--y-----w" chunk="Vp#4">s-</w><w lemma="avea" ana="Va--3s" chunk="Vp#4">a</w><w lemma="speria" ana="Vmp--sm" chunk="Vp#4">speriat</w><w lemma="de" ana="Spsa" chunk="Pp#1">de</w><w lemma="un" ana="Tifsr" chunk="Pp#1,Np#3">o</w><w lemma="furtună" ana="Ncfsrn" chunk="Pp#1,Np#3">furtună</w><w lemma="în_timp_ce" ana="Cscsp">în_timp_ce</w><w lemma="zbura" ana="Vmii3p" chunk="Vp#5">zburau</w><w lemma="cu" ana="Spsa" chunk="Pp#2">cu</w><w lemma="avion" ana="Ncmsry" chunk="Pp#2,Np#4">avionul</w><c>.</c></s></seg>
+</segs>
@@ -0,0 +1,121 @@
+'''
+Created on May 22, 2016
+
+@author: tibi
+'''
+from model import Word
+
+def getGender(word):
+    """Return the gender code used when matching *word* against another word.
+
+    First- and second-person pronouns are reported as 'n' whatever their tag
+    says; every other word defers to ``word.getGender()``.
+    """
+    firstOrSecondPerson = word.isPronoun() and word.pronounGetPerson() in ('1', '2')
+    return 'n' if firstOrSecondPerson else word.getGender()
+
+def genderMatch(word1, word2):
+    """Score how well the genders of two words agree.
+
+    Returns 2 when both gender codes are equal, 1 when they differ but at
+    least one of them is 'n', and 0 otherwise.
+    """
+    genders = (getGender(word1), getGender(word2))
+
+    if genders[0] == genders[1]:
+        return 2
+    return 1 if 'n' in genders else 0
+
+def isPrepositional(chunk):
+    """Tell whether at least one word of *chunk* is a preposition."""
+    return any(member.isPreposition() for member in chunk)
+
+def countInText(noun, text):
+    """Count the words of *text* whose ``text`` attribute equals ``noun.text``."""
+    return sum(1 for candidate in text if candidate.text == noun.text)
+
+def anaphora(text, chunks):
+    """Print, for every pronoun in *text*, its scored antecedent candidates.
+
+    The words are scanned in order. Each noun is remembered together with
+    the key of its chunk. When a pronoun is met, the 29 most recently seen
+    nouns that agree with it in gender and number are scored with a set of
+    additive indicators, and the best-scoring one is printed as the pick
+    (on ties the most recently seen noun wins).
+
+    Parameters:
+        text: sequence of word objects; it is iterated once here and once
+            more per candidate by countInText.
+        chunks: mapping from ``(sentenceIndex, chunk)`` to the list of
+            words in that chunk.
+
+    Returns:
+        None. All results are written to standard output.
+    """
+    nounPhrases = []
+
+    for word in text:
+
+        if word.isNoun():
+            print("[n]", word)
+            nounPhrases.append((word, (word.sentenceIndex, word.chunk)))
+        else:
+            print(word)
+
+        if not word.isPronoun():
+            continue
+
+        candidates = []
+
+        # Walk backwards over the most recent nouns (at most 29 of them)
+        for noun, chunkIndex in nounPhrases[:-30:-1]:
+
+            # Gender must be compatible and number must be equal
+            npInd = genderMatch(word, noun)
+            if npInd <= 0 or word.getNumber() != noun.getNumber():
+                continue
+
+            # definiteness
+            # nounIsDefinite() may answer with True or with the tag
+            # character ('y' / 'n'); the character 'n' is a truthy string,
+            # so the answer is compared explicitly instead of negated.
+            definite = noun.nounIsDefinite()
+            if definite is not True and definite != 'y':
+                npInd -= 1
+
+            # non-prepositional noun phrase
+            if isPrepositional(chunks[chunkIndex]):
+                npInd -= 1
+
+            # first in sentence
+            # NOTE(review): this tests the sentence number, i.e. it rewards
+            # nouns of the first sentence, not the first noun of a sentence.
+            # Confirm which of the two is intended.
+            if noun.sentenceIndex == 1:
+                npInd += 1
+
+            # indicating verbs
+            # todo...
+
+            # lexical reiteration
+            c = countInText(noun, text)
+            if c == 2:
+                npInd += 1
+            if c > 2:
+                npInd += 2
+
+            # noun is representing term
+            # how?
+
+            # identical collocation pattern to the pronoun
+            # ???
+
+            # immediate reference, resolving 'it'
+            # applicable?
+
+            # referential distance
+            dist = word.sentenceIndex - noun.sentenceIndex
+            if dist == 0:
+                npInd += 2
+            elif dist == 1:
+                npInd += 1
+
+            candidates.append((noun, npInd))
+            print("...> Candidate: {0} npInd = {1}".format(noun, npInd))
+
+        if candidates:
+            # max() keeps the first of several equally scored candidates
+            pickedWord, _ = max(candidates, key=lambda candidate: candidate[1])
+            print(".>>> Picked: {0}".format(pickedWord))
@@ -0,0 +1,55 @@
+'''
+Created on May 22, 2016
+
+@author: tibi
+'''
+
+from xml.dom import minidom;
+from model.Word import Word
+
+def parse(filename):
+    """Parse a chunk-annotated XML file into a word list and a chunk table.
+
+    The file is expected to contain a ``segs`` element holding ``seg``
+    paragraphs, ``s`` sentences and ``w`` words carrying the attributes
+    ``lemma``, ``ana`` and ``chunk``. The ``chunk`` attribute may name
+    several chunks separated by commas; one Word is created per chunk name,
+    so a word that belongs to several chunks appears several times in the
+    returned list.
+
+    Parameters:
+        filename: path of the XML file to read.
+
+    Returns:
+        A tuple ``(words, chunks)`` where ``words`` is the list of Word
+        objects in document order and ``chunks`` maps
+        ``(sentenceIndex, chunkName)`` to the list of Words in that chunk.
+        Sentence and word indices start at 1.
+
+    Raises:
+        IndexError: when the document has no ``segs`` element.
+    """
+    words = []
+    chunks = {}
+
+    sentence_i = 0
+
+    # get the root "segs" element
+    dom = minidom.parse(filename)
+    alltext = dom.getElementsByTagName("segs")
+
+    # iterate paragraphs
+    for paragraph in alltext[0].getElementsByTagName("seg"):
+
+        # iterate sentences
+        for sentence in paragraph.getElementsByTagName("s"):
+
+            # sentences are numbered across the whole document
+            sentence_i += 1
+
+            # iterate words, numbered from 1 within the sentence
+            for word_i, word in enumerate(sentence.getElementsByTagName("w"), start=1):
+
+                # obtain word info; an empty <w/> element has no text node
+                textNode = word.firstChild
+                wordText = textNode.data if textNode is not None else ""
+                lemma = word.getAttribute("lemma")
+                ana = word.getAttribute("ana")
+                chunk = word.getAttribute("chunk")
+
+                # one Word per chunk the token belongs to
+                for c in chunk.split(","):
+                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
+                    words.append(w)
+                    chunks.setdefault((sentence_i, c), []).append(w)
+
+    return (words, chunks)
@@ -0,0 +1,26 @@
+'''
+Created on May 22, 2016
+
+@author: tibi
+'''
+import fileparser
+from anaphora import anaphora
+
+if __name__ == '__main__':
+
+    # Parse the sample document into its word list and chunk table
+    parsedWords, chunkTable = fileparser.parse("../data/text3_processed.xml")
+
+    # Dump every word with its sentence and word position
+    print("Words:")
+    for item in parsedWords:
+        print("[{0} {1}] {2}".format(item.sentenceIndex, item.wordIndex, item))
+    print("")
+
+    # Dump each chunk key followed by the words it contains
+    print("Chunks:")
+    for chunkKey, members in chunkTable.items():
+        print(chunkKey, ":")
+        for member in members:
+            print("   - ", member)
+    print("")
+
+    # Run the resolver; it prints its own report
+    print("Anaphora resolution:") 
+    anaphora(parsedWords, chunkTable)
@@ -0,0 +1,88 @@
+'''
+Created on May 22, 2016
+
+@author: tibi
+'''
+
+class Word:
+    """A token with its lemma, positional analysis tag and chunk label.
+
+    The accessors below read single characters of ``ana`` (for example
+    "Ncfp-n" or "Np"). Tags may be shorter than the position an accessor
+    needs; in that case the accessor answers None (or False for the
+    boolean tests) instead of raising IndexError.
+    """
+
+    # Class-level defaults; every instance overwrites them in __init__
+    text = ""
+    lemma = ""
+    ana = ""
+    chunk = ""
+
+    sentenceIndex = 0
+    wordIndex = 0
+
+    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
+        self.text = text
+        self.lemma = lemma
+        self.ana = ana
+        self.chunk = chunk
+        self.sentenceIndex = sentenceIndex
+        self.wordIndex = wordIndex
+
+    def __str__(self):
+        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)
+
+    def __repr__(self):
+        return str(self)
+
+    def _tagAt(self, position):
+        """Return the tag character at *position*, or None if the tag is too short."""
+        if position < len(self.ana):
+            return self.ana[position]
+        return None
+
+    def isNoun(self):
+        """True when the tag starts with 'N'."""
+        return self._tagAt(0) == "N"
+
+    def nounIsCommon(self):
+        """True for a noun whose second tag character is 'c'."""
+        return self.isNoun() and self._tagAt(1) == "c"
+
+    def nounIsProper(self):
+        """True for a noun whose second tag character is 'p'."""
+        return self.isNoun() and self._tagAt(1) == "p"
+
+    def nounGetCase(self):
+        """Return tag character 4 of a noun; None for other words or short tags."""
+        if self.isNoun():
+            return self._tagAt(4)
+
+        return None
+
+    def nounIsDefinite(self):
+        """Tell whether the word is a definite (articulated) noun.
+
+        Proper nouns always count as definite. Other nouns are definite when
+        tag character 5 is 'y'. The answer is always a bool, so it can be
+        negated safely by callers.
+        """
+        if not self.isNoun():
+            return False
+
+        if self.nounIsProper():
+            return True
+
+        return self._tagAt(5) == "y"
+
+    def pronounGetPerson(self):
+        """Return tag character 2 of a pronoun; None for other words or short tags."""
+        if self.isPronoun():
+            return self._tagAt(2)
+
+        return None
+
+    def getGender(self):
+        """Return the gender code of a noun or pronoun, None for other words.
+
+        Nouns use tag character 2 and fall back to 'n' when the tag is too
+        short; pronouns use tag character 3.
+        """
+        if self.isNoun():
+            gender = self._tagAt(2)
+            return gender if gender is not None else 'n'
+
+        if self.isPronoun():
+            return self._tagAt(3)
+
+        return None
+
+    def getNumber(self):
+        """Return the number code of a noun or pronoun, None for other words.
+
+        Proper nouns are always reported as 's'. Other nouns use tag
+        character 3; pronouns use tag character 4.
+        """
+        if self.isNoun():
+            if self.nounIsProper():
+                return 's'
+            return self._tagAt(3)
+
+        if self.isPronoun():
+            return self._tagAt(4)
+
+        return None
+
+    def isPronoun(self):
+        """True when the tag starts with 'P'."""
+        return self._tagAt(0) == "P"
+
+    def isVerb(self):
+        """True when the tag starts with 'V'."""
+        return self._tagAt(0) == "V"
+
+    def isPreposition(self):
+        """True when the tag starts with 'Sp'."""
+        return self._tagAt(0) == "S" and self._tagAt(1) == "p"
@@ -0,0 +1 @@
+books/Moara cu noroc - Ioan Slavici.epub;Moara cu noroc;1880;Ioan Slavici;1848;Transilvania
@@ -0,0 +1,20 @@
+import time
+import logging
+
+def init_logger(level):
+    """Configure root logging with a DEBUG log file and a console handler.
+
+    The log file is named ``logs/log_<timestamp>.log`` after the time of the
+    call. The console handler is added to the root logger on every call.
+
+    Parameters:
+        level: threshold of the console handler (e.g. ``logging.WARNING``).
+            The log file always uses ``logging.DEBUG``.
+    """
+    import os
+
+    # Log filename
+    tm = time.strftime('%Y-%m-%d_%H-%M-%S')
+    logFile = "logs/log_{0}.log".format(tm)
+
+    # basicConfig cannot open the file when its directory is missing
+    os.makedirs(os.path.dirname(logFile), exist_ok=True)
+
+    # Set up file logger
+    logging.basicConfig(filename=logFile,
+                        level=logging.DEBUG,
+                        format='%(asctime)s %(name)s %(levelname)s %(message)s',
+                        datefmt='%m-%d %H:%M')
+
+    # Set up console logger
+    formatter = logging.Formatter('[%(name)s] %(levelname)s: %(message)s')
+    console = logging.StreamHandler()
+    console.setLevel(level)
+    console.setFormatter(formatter)
+    logging.getLogger().addHandler(console)
@@ -0,0 +1,40 @@
+import logging
+import time
+# own
+import logger
+import storage.data
+import storage.results
+import textprocessor.letterfreq
+import ttl.ttlparser
+import ttl.ttlservice
+
+def init():
+    """Configure console logging at WARNING level and initialise both databases."""
+    consoleLevel = logging.WARNING
+    textsDb = "data/texts.db"
+    resultsDb = "data/results.db"
+
+    logger.init_logger(consoleLevel)
+    storage.data.initializeFragmentDatabase(textsDb)
+    # The second argument asks for old results to be cleaned up
+    storage.results.initializeResultsDatabase(resultsDb, True)
+
+def processTexts():
+    """Compute and store letter frequencies for every stored text.
+
+    Progress is reported on standard output, one line per text, with a
+    zero-based counter.
+    """
+    total = storage.data.getTextCount()
+
+    for position, row in enumerate(storage.data.getAllTexts()):
+        print("Processing item", position, "out of", total)
+
+        # each row is (id, content)
+        itemid, itemtext = row[0], row[1]
+
+        # TTL analysis is disabled: infeasible, it takes 5-10 minutes for a
+        # single text. The disabled steps were:
+        #   ttlResult = ttl.ttlservice.executeTtl(itemtext)
+        #   (words, chunks) = ttl.ttlparser.parseText(ttlResult)
+        #   storage.results.storeTtlAnalysis(itemid, words)
+
+        # perform analysis and persist it
+        storage.results.storeFrequencies(
+            itemid, textprocessor.letterfreq.letterFrequencies(itemtext))
+
+    print("Finished!")
+
+# Module-level entry point: there is no __main__ guard, so the whole
+# pipeline runs whenever this module is executed or imported.
+init()
+processTexts()
+
@@ -0,0 +1,32 @@
+# Defines a fragment author
+class Author:
+    """Author of a text fragment: name, year of birth and location."""
+
+    def __init__(self, name = "", birthYear = "", location = "Romania"):
+        self.name, self.yearOfBirth, self.location = name, birthYear, location
+
+    def __str__(self):
+        return self.name
+
+    # repr() shows the bare name as well
+    __repr__ = __str__
+
+    def dump(self):
+        """Return a bracketed one-line description listing every field."""
+        template = "[Author name={0} yearOfBirth={1} location={2}]"
+        return template.format(self.name, self.yearOfBirth, self.location)
+
+# Defines a text fragment
+class Fragment:
+    """A text fragment with its title, body text, author and year.
+
+    Parameters:
+        title: title of the fragment.
+        text: body of the fragment.
+        author: object providing ``dump()``; when omitted (or None) a new
+            default ``Author()`` is created for this fragment.
+        year: year attached to the fragment.
+    """
+
+    def __init__(self, title = "", text = "", author = None, year = 1999):
+        self.title = title
+        self.text = text
+        # A default written as ``author = Author()`` is evaluated once and
+        # shared by every fragment; build a fresh one per instance instead.
+        self.author = Author() if author is None else author
+        self.year = year
+
+    def __str__(self):
+        return self.title
+
+    def __repr__(self):
+        return self.title
+
+    def dump(self):
+        """Return a bracketed one-line description, including the author's dump."""
+        return "[Fragment title={0} author={1} year={2} text={3}]".format(self.title, self.author.dump(), self.year, self.text)
@@ -0,0 +1,87 @@
+
+# Defines a processed word
+class Word:
+    """A token with its lemma, positional analysis tag and chunk label.
+
+    The accessors below read single characters of ``ana`` (for example
+    "Ncfp-n" or "Np"). Tags may be shorter than the position an accessor
+    needs; in that case the accessor answers None (or False for the
+    boolean tests) instead of raising IndexError.
+    """
+
+    # Class-level defaults; every instance overwrites them in __init__
+    text = ""
+    lemma = ""
+    ana = ""
+    chunk = ""
+
+    sentenceIndex = 0
+    wordIndex = 0
+
+    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
+        self.text = text
+        self.lemma = lemma
+        self.ana = ana
+        self.chunk = chunk
+        self.sentenceIndex = sentenceIndex
+        self.wordIndex = wordIndex
+
+    def __str__(self):
+        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)
+
+    def __repr__(self):
+        return str(self)
+
+    def _tagAt(self, position):
+        """Return the tag character at *position*, or None if the tag is too short."""
+        if position < len(self.ana):
+            return self.ana[position]
+        return None
+
+    def isNoun(self):
+        """True when the tag starts with 'N'."""
+        return self._tagAt(0) == "N"
+
+    def nounIsCommon(self):
+        """True for a noun whose second tag character is 'c'."""
+        return self.isNoun() and self._tagAt(1) == "c"
+
+    def nounIsProper(self):
+        """True for a noun whose second tag character is 'p'."""
+        return self.isNoun() and self._tagAt(1) == "p"
+
+    def nounGetCase(self):
+        """Return tag character 4 of a noun; None for other words or short tags."""
+        if self.isNoun():
+            return self._tagAt(4)
+
+        return None
+
+    def nounIsDefinite(self):
+        """Tell whether the word is a definite (articulated) noun.
+
+        Proper nouns always count as definite. Other nouns are definite when
+        tag character 5 is 'y'. The answer is always a bool, so it can be
+        negated safely by callers.
+        """
+        if not self.isNoun():
+            return False
+
+        if self.nounIsProper():
+            return True
+
+        return self._tagAt(5) == "y"
+
+    def pronounGetPerson(self):
+        """Return tag character 2 of a pronoun; None for other words or short tags."""
+        if self.isPronoun():
+            return self._tagAt(2)
+
+        return None
+
+    def getGender(self):
+        """Return the gender code of a noun or pronoun, None for other words.
+
+        Nouns use tag character 2 and fall back to 'n' when the tag is too
+        short; pronouns use tag character 3.
+        """
+        if self.isNoun():
+            gender = self._tagAt(2)
+            return gender if gender is not None else 'n'
+
+        if self.isPronoun():
+            return self._tagAt(3)
+
+        return None
+
+    def getNumber(self):
+        """Return the number code of a noun or pronoun, None for other words.
+
+        Proper nouns are always reported as 's'. Other nouns use tag
+        character 3; pronouns use tag character 4.
+        """
+        if self.isNoun():
+            if self.nounIsProper():
+                return 's'
+            return self._tagAt(3)
+
+        if self.isPronoun():
+            return self._tagAt(4)
+
+        return None
+
+    def isPronoun(self):
+        """True when the tag starts with 'P'."""
+        return self._tagAt(0) == "P"
+
+    def isVerb(self):
+        """True when the tag starts with 'V'."""
+        return self._tagAt(0) == "V"
+
+    def isPreposition(self):
+        """True when the tag starts with 'Sp'."""
+        return self._tagAt(0) == "S" and self._tagAt(1) == "p"
@@ -0,0 +1,80 @@
+import logging
+import os
+from model import *
+import sqlite3
+
+# Logger shared under the name "storage"
+log = logging.getLogger("storage")
+
+# Path of the text database; set by initializeFragmentDatabase
+DB_FRAGMENTS = ""
+
+# SQL commands that create the schema of the text database
+
+# Authors table
+# birthLocation - general area, not exact location (e.g. Transylvania)
+# birthOrigin - rural or urban
+# studies - masters, bachelors, high school, middle school, primary school
+# occupations - comma separated if there are multiple
+# studiesAbroad - foreign cities where the author studied (comma separated)
+COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
+    name TEXT PRIMARY KEY,
+    birthYear INTEGER,
+    birthLocation TEXT,
+    birthOrigin TEXT,
+    studies TEXT,
+    occupations TEXT,
+    studiesAbroad TEXT
+    )"""
+
+# Fragments table (metadata of each text; author refers to Authors.name)
+# genre - short story (nuvela), novel (roman), poem etc.
+# movement - literary movement (submovements separated by /) (e.g. realism/naturalism)
+# tags - other relevant information (e.g. psychological)
+COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
+    id INTEGER PRIMARY KEY,
+    title TEXT,
+    year INTEGER,
+    author TEXT REFERENCES Authors(name),
+    genre TEXT,
+    movement TEXT,
+    tags TEXT
+    )"""
+
+# FragmentsContent table: contains the actual text, keyed by Fragments.id
+COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
+    id INTEGER REFERENCES Fragments(id),
+    content TEXT
+    )"""
+
+# Initialize databases
+def initializeFragmentDatabase(dbFile):
+    """Remember *dbFile* as the text database and create it when missing.
+
+    The path is stored in the module-level DB_FRAGMENTS. When no file exists
+    at that path, a new database is created with the Authors, Fragments and
+    FragmentsContent tables; an existing file is left untouched.
+
+    Parameters:
+        dbFile: path of the SQLite database file.
+
+    Raises:
+        sqlite3.Error: when the database cannot be opened or created.
+    """
+    global DB_FRAGMENTS
+    DB_FRAGMENTS = dbFile
+
+    if os.path.exists(dbFile):
+        return
+
+    log.info("Text database %s not found. Will create database.", dbFile)
+    con = sqlite3.connect(dbFile)
+    try:
+        c = con.cursor()
+        for command in (COMMAND_CREATE_AUTHORS,
+                        COMMAND_CREATE_FRAGMENTS,
+                        COMMAND_CREATE_FRAGMENTS_CONTENT):
+            c.execute(command)
+        con.commit()
+    finally:
+        # Release the connection even when a statement fails
+        con.close()
+    log.info("Database created!")
+
+def getTextCount():
+    """Return the number of rows in the Fragments table of DB_FRAGMENTS.
+
+    Raises:
+        sqlite3.Error: when the database cannot be opened or queried.
+    """
+    con = sqlite3.connect(DB_FRAGMENTS)
+    try:
+        c = con.cursor()
+        c.execute("SELECT COUNT(*) FROM Fragments")
+        item = c.fetchone()
+        c.close()
+    finally:
+        # Release the connection even when the query fails
+        con.close()
+    return item[0]
+
+def getAllTexts():
+    """Return every ``(id, content)`` row of the FragmentsContent table.
+
+    All rows are fetched into a list before the connection is closed.
+
+    Raises:
+        sqlite3.Error: when the database cannot be opened or queried.
+    """
+    con = sqlite3.connect(DB_FRAGMENTS)
+    try:
+        c = con.cursor()
+        c.execute("SELECT id, content FROM FragmentsContent")
+        items = c.fetchall()
+        c.close()
+    finally:
+        # Release the connection even when the query fails
+        con.close()
+    return items
@@ -0,0 +1,84 @@
+import logging
+import os
+from model.Word import *
+import sqlite3
+
+# Logger shared under the name "storage"
+log = logging.getLogger("storage")
+
+# Path of the results database; set by initializeResultsDatabase
+DB_RESULTS = ""
+
+# One row per (text, letter group): storeFrequencies fills category with
+# 'p', 'l1', 'l2' or 'l3' and frequency with the value it was given.
+COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies (
+    idtext INTEGER,
+    lettergroup TEXT,
+    category TEXT,
+    frequency REAL
+    )"""
+
+# One row per analysed word of a text; filled by storeTtlAnalysis.
+COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords (
+    idtext INTEGER,
+    wordIndex INTEGER,
+    sentenceIndex INTEGER,
+    word TEXT,
+    lemma TEXT,
+    analysis TEXT,
+    chunk TEXT
+    )"""
+
+# Not used yet: schema for a word-length histogram table.
+# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
+#     idtext INTEGER,
+#     wordlength INTEGER,
+#     frequency REAL
+#     )"""
+
+def initializeResultsDatabase(dbFile, cleanupOldData):
+    """Remember *dbFile* as the results database and prepare its tables.
+
+    The path is stored in the module-level DB_RESULTS. With
+    ``cleanupOldData`` true, the LetterFrequencies and TextWords tables are
+    dropped and recreated empty. Otherwise existing tables and their data
+    are kept and only missing tables are created, so that the store
+    functions also work on a fresh database.
+
+    Parameters:
+        dbFile: path of the SQLite database file.
+        cleanupOldData: whether previously stored results are discarded.
+
+    Raises:
+        sqlite3.Error: when the database cannot be opened or modified.
+    """
+    global DB_RESULTS
+    DB_RESULTS = dbFile
+
+    tables = (
+        ("LetterFrequencies", COMMAND_CREATE_LETTER_FREQUENCIES),
+        ("TextWords", COMMAND_CREATE_TEXT_WORDS),
+    )
+
+    con = sqlite3.connect(DB_RESULTS)
+    try:
+        c = con.cursor()
+        for tableName, createCommand in tables:
+            if cleanupOldData:
+                # Table names come from the constant tuple above. IF EXISTS
+                # replaces catching OperationalError, which would also hide
+                # unrelated failures.
+                c.execute("DROP TABLE IF EXISTS " + tableName)
+            else:
+                c.execute("SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?",
+                          (tableName,))
+                if c.fetchone() is not None:
+                    continue
+            c.execute(createCommand)
+
+        con.commit()
+        c.close()
+    finally:
+        # Release the connection even when a statement fails
+        con.close()
+
+
+def storeFrequencies(idtext, freq):
+    """Store the frequency groups of one text in the LetterFrequencies table.
+
+    Parameters:
+        idtext: id of the text the frequencies belong to.
+        freq: sequence of four iterables of ``(lettergroup, frequency)``
+            pairs; they are stored under the categories 'p', 'l1', 'l2'
+            and 'l3', in that order.
+
+    Raises:
+        IndexError: when *freq* holds fewer than four groups.
+        sqlite3.Error: when the database cannot be opened or written.
+    """
+    categories = ('p', 'l1', 'l2', 'l3')
+    rows = [
+        (idtext, let, category, fr)
+        for i, category in enumerate(categories)
+        for let, fr in freq[i]
+    ]
+
+    con = sqlite3.connect(DB_RESULTS)
+    try:
+        c = con.cursor()
+        c.executemany("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", rows)
+        con.commit()
+        c.close()
+    finally:
+        # Release the connection even when an insert fails
+        con.close()
+
+def storeTtlAnalysis(idtext, words):
+    """Store the analysed words of one text in the TextWords table.
+
+    Parameters:
+        idtext: id of the text the words belong to.
+        words: iterable of objects with the attributes ``wordIndex``,
+            ``sentenceIndex``, ``text``, ``lemma``, ``ana`` and ``chunk``.
+
+    Raises:
+        sqlite3.Error: when the database cannot be opened or written.
+    """
+    rows = [
+        (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk)
+        for word in words
+    ]
+
+    con = sqlite3.connect(DB_RESULTS)
+    try:
+        c = con.cursor()
+        c.executemany("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", rows)
+        con.commit()
+        c.close()
+    finally:
+        # Release the connection even when an insert fails
+        con.close()
@@ -0,0 +1,14 @@
+# coding: utf-8
+from ttl import ttlservice
+from ttl import ttlparser
+import nltk
+
+import storage
+
+# Scratch check: call storage.parseIndex with the "data" argument and print
+# whatever it returns.
+data = storage.parseIndex("data")
+print(data)
+
+#textXml = ttlservice.executeTtl(u"Numele meu este Tibi și îmi place să cânt la chitară bass. Ce faci?")
+#words, chunks = ttlparser.parseText(textXml)
+#print ("Words: ", words)
+#print ("Chunks: ", chunks)
@@ -0,0 +1,38 @@
+import operator
+import storage
+
+def letterFrequencies(text):
+    """Compute relative frequencies of punctuation and of 1-, 2- and 3-character groups.
+
+    Returns a list of four lists. Index 0 covers printable characters that are
+    neither alphanumeric nor whitespace; index ``k`` (1 to 3) covers lower-cased
+    alphanumeric substrings of length ``k``. Each inner list holds at most the
+    50 most frequent ``(symbol, relative_frequency)`` pairs, most frequent
+    first, where the frequency is relative to all occurrences counted for
+    that index.
+    """
+    tables = [{}, {}, {}, {}]
+    totals = [0, 0, 0, 0]
+
+    for position, character in enumerate(text):
+
+        # substrings of every supported size starting at this position
+        for size in (1, 2, 3):
+            group = text[position : position + size].lower()
+            if len(group) != size or not group.isalnum():
+                continue
+            totals[size] += 1
+            tables[size][group] = tables[size].get(group, 0) + 1
+
+        # punctuation: printable, but neither alphanumeric nor whitespace
+        if character.isalnum() or character.isspace() or not character.isprintable():
+            continue
+        totals[0] += 1
+        tables[0][character] = tables[0].get(character, 0) + 1
+
+    # Rank by count, keep the 50 most frequent entries and normalize them
+    result = []
+    for table, total in zip(tables, totals):
+        ranked = list(table.items())
+        ranked.sort(key=operator.itemgetter(1), reverse=True)
+        result.append([(entry, count / total) for entry, count in ranked[:50]])
+
+    return result
@@ -0,0 +1,2 @@
+def analyzeWords(text):
+    """Placeholder for word-level analysis; currently does nothing and returns None."""
@@ -0,0 +1,128 @@
+import urllib
+from pyquery import PyQuery
+import sqlite3
+import re
+
+BASE_URL = "https://ro.wikisource.org"
+LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+
+def getAuthorList():
+    """Collect the authors listed on the per-letter category pages.
+
+    For every letter in ``LETTERS`` the page
+    ``BASE_URL + '/wiki/Categorie:Autori-' + letter`` is downloaded and each
+    anchor whose text starts with ``"Autor:"`` is recorded.
+
+    Returns:
+        A list of unique ``(author name, href)`` tuples in first-seen order.
+
+    Raises:
+        urllib.error.URLError: if a category page cannot be downloaded.
+    """
+    # A bare ``import urllib`` does not guarantee the ``request`` submodule is loaded.
+    import urllib.request
+
+    authors = []
+    for letter in LETTERS:
+        print("Processing link page for letter", letter)
+        # Read index page
+        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
+        with urllib.request.urlopen(url) as response:
+            data = response.read()
+        q = PyQuery(data)
+        for item in q("div.mw-category-generated").find("a"):
+            # item.text is None when the anchor has no leading text node
+            if item.text and item.text.startswith("Autor:"):
+                authorname = item.text[6:]
+                authorlink = item.attrib['href']
+                authors.append((authorname, authorlink))
+
+    # dict.fromkeys removes duplicates while keeping a deterministic order
+    return list(dict.fromkeys(authors))
+
+def getAuthorWikiLink(query):
+    """Return the author's ro.wikipedia.org link found on an author page.
+
+    Args:
+        query: callable that takes a CSS selector and returns an object with a
+            ``find`` method (a ``PyQuery`` document in this module).
+
+    Returns:
+        The ``href`` of the last anchor inside a table of
+        ``div#mw-content-text`` that contains ``"ro.wikipedia.org"``, or
+        ``None`` when there is no such anchor.
+    """
+    wikilink = None
+    tables = query("div#mw-content-text").find("table")
+    for link in tables.find("a"):
+        # Anchors without an href (e.g. named anchors) are skipped instead of
+        # raising KeyError.
+        href = link.attrib.get('href')
+        if href and "ro.wikipedia.org" in href:
+            wikilink = href
+    return wikilink
+
+def getAuthorLinksList(authorname, query):
+    """List the links on an author page that are kept as candidate texts.
+
+    Args:
+        authorname: not used by the filter; kept for interface compatibility.
+        query: callable that takes a CSS selector and returns an object with a
+            ``find`` method (a ``PyQuery`` document in this module).
+
+    Returns:
+        The ``href`` values, in document order, of the anchors inside
+        ``div#mw-content-text`` that are not in-page fragments (``#...``) and
+        contain none of the excluded markers. Anchors with a missing or empty
+        ``href`` are skipped.
+    """
+    # Any of these substrings disqualifies a link: absolute URLs, links to
+    # missing pages, file pages, script URLs and other author pages.
+    excluded = ("http", "redlink", "Fi%C8%99ier:", "index.php", "Autor:")
+
+    links = []
+    for link in query("div#mw-content-text").find("a"):
+        # .get() avoids a KeyError on anchors that carry no href at all
+        address = link.attrib.get('href')
+        if not address or address.startswith("#"):
+            continue
+        if any(marker in address for marker in excluded):
+            continue
+        links.append(address)
+    return links
+
+def getAuthorBasicInfo(authorname, authorlink):
+    """Download an author's page and extract the basic information from it.
+
+    Args:
+        authorname: passed through to ``getAuthorLinksList``.
+        authorlink: path of the author page, appended to ``BASE_URL``.
+
+    Returns:
+        A dict with the keys ``"wiki"`` (result of ``getAuthorWikiLink``) and
+        ``"links"`` (result of ``getAuthorLinksList``).
+
+    Raises:
+        urllib.error.URLError: if the page cannot be downloaded.
+    """
+    # A bare ``import urllib`` does not guarantee the ``request`` submodule is loaded.
+    import urllib.request
+
+    # The context manager closes the HTTP response once the body has been read.
+    with urllib.request.urlopen(BASE_URL + authorlink) as response:
+        q = PyQuery(response.read())
+
+    return {
+        "wiki": getAuthorWikiLink(q),
+        "links": getAuthorLinksList(authorname, q),
+    }
+
+# def getAuthorWikiInfo(authorinfo):
+
+#     # Nothing can be learned without wiki page
+#     if authorinfo["wiki"] is None:
+#         return authorinfo
+
+#     try:
+#         data = urllib.request.urlopen(authorinfo["wiki"]).read()
+#         q = PyQuery(data)
+        
+#         # Find the birth date
+#         body = q("#mw-content-text").text()
+#         result = re.compile(u"Născut\s+([\w\s]+)").match(body)
+#         if not result is None:
+#             authorinfo["birthyear"] = result.group(0)
+
+#     except urllib.error.HTTPError:
+#         pass
+
+#     return authorinfo
+
+def getText(url):
+    """Download a text page and return its title and body text.
+
+    Args:
+        url: path of the page, appended to ``BASE_URL``.
+
+    Returns:
+        A ``(title, content)`` tuple: the text of the ``h1`` element(s) and the
+        text of ``#mw-content-text`` after all tables were removed from it.
+
+    Raises:
+        urllib.error.HTTPError: if the server answers with an error status.
+        urllib.error.URLError: if the page cannot be reached.
+    """
+    # A bare ``import urllib`` does not guarantee the ``request`` submodule is loaded.
+    import urllib.request
+
+    # The context manager closes the HTTP response once the body has been read.
+    with urllib.request.urlopen(BASE_URL + url) as response:
+        q = PyQuery(response.read())
+
+    texttitle = q("h1").text()
+
+    body = q("#mw-content-text")
+    # Tables are dropped so their text does not end up in the content.
+    body.find("table").remove()
+
+    textcontent = body.text()
+    return (texttitle, textcontent)
+
+def addAuthorToDb(authorinfo):
+    """Insert one author row into the ``Authors`` table of ``data/texts.db``.
+
+    The previous statement (``INSERT INTO Authors`` with no values) was not
+    valid SQL, so the function could never succeed. The insert now mirrors
+    the one issued by ``getAllTexts``.
+
+    Args:
+        authorinfo: mapping describing the author; ``"wiki"`` is stored in the
+            ``wiki`` column.
+            NOTE(review): the ``"name"`` key is an assumption - the dict built
+            by ``getAuthorBasicInfo`` does not contain it; confirm with callers.
+
+    Raises:
+        sqlite3.Error: if the row cannot be written.
+    """
+    con = sqlite3.connect("data/texts.db")
+    try:
+        c = con.cursor()
+        c.execute(
+            "INSERT INTO Authors(name,wiki) VALUES(?, ?)",
+            (authorinfo.get("name"), authorinfo.get("wiki")),
+        )
+        con.commit()
+        c.close()
+    finally:
+        con.close()
+
+def getAllTexts():
+    """Crawl every author and store authors and texts in ``data/texts.db``.
+
+    For each author returned by ``getAuthorList`` a row is inserted into
+    ``Authors``; for each of the author's links the page is downloaded with
+    ``getText`` and stored in ``Fragments`` / ``FragmentsContent`` under a
+    sequential id starting at 1. Pages that answer with an HTTP error are
+    skipped. Everything is committed once, after the last author.
+
+    Raises:
+        sqlite3.Error: if a row cannot be written.
+        urllib.error.URLError: if an index or author page cannot be downloaded.
+    """
+    # A bare ``import urllib`` does not guarantee the ``error`` submodule is loaded.
+    import urllib.error
+
+    con = sqlite3.connect("data/texts.db")
+    try:
+        c = con.cursor()
+        #c.execute("ALTER TABLE Authors ADD COLUMN wiki TEXT")
+        fragment_id = 1
+
+        for authorname, authorlink in getAuthorList():
+            print("Processing author", authorname)
+            authorinfo = getAuthorBasicInfo(authorname, authorlink)
+            c.execute("INSERT INTO Authors(name,wiki) VALUES(?, ?)", (authorname, authorinfo["wiki"]))
+
+            for text in authorinfo["links"]:
+                # Only the download is allowed to fail silently; database
+                # errors must not be mistaken for a missing page.
+                try:
+                    title, content = getText(text)
+                except urllib.error.HTTPError:
+                    continue
+                c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)", (fragment_id, title, authorname))
+                c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)", (fragment_id, content))
+                fragment_id += 1
+
+        con.commit()
+    finally:
+        # Close the connection on success and on failure alike.
+        con.close()
+
+# Module-level call: the full crawl starts whenever this file is run or imported.
+getAllTexts()
@@ -0,0 +1,62 @@
+'''
+Created on May 22, 2016
+
+@author: tibi
+'''
+
+from xml.dom import minidom;
+from xml.parsers.expat import ExpatError
+from model.Word import Word
+
+def parseText(xmlText):
+    """Parse the XML produced for a text into words and per-sentence chunks.
+
+    The first ``segs`` element is walked as ``seg`` (paragraph) -> ``s``
+    (sentence) -> ``w`` (word). Sentences are numbered from 1 across the whole
+    document, words from 1 inside each sentence. A word whose ``chunk``
+    attribute lists several comma-separated chunks yields one ``Word`` per
+    chunk.
+
+    Args:
+        xmlText: the XML document as a string.
+
+    Returns:
+        A ``(words, chunks)`` tuple: ``words`` is the list of all ``Word``
+        objects in document order; ``chunks`` maps ``(sentence index, chunk)``
+        to the list of ``Word`` objects belonging to that chunk. Both are
+        empty when the document has no ``segs`` element.
+
+    Raises:
+        SystemExit: with code -1 when ``xmlText`` is not well-formed XML.
+    """
+    words = []
+    chunks = {}
+
+    # get the root "segs" element
+    try:
+        dom = minidom.parseString(xmlText)
+    except ExpatError as e:
+        print("Error in text:", xmlText)
+        print(e)
+        # Same exception as exit(-1), without relying on the builtin that the
+        # site module only installs for interactive use.
+        raise SystemExit(-1)
+
+    alltext = dom.getElementsByTagName("segs")
+    if not alltext:
+        return (words, chunks)
+
+    sentence_i = 0
+
+    # iterate paragraphs
+    for paragraph in alltext[0].getElementsByTagName("seg"):
+
+        # iterate sentences
+        for sentence in paragraph.getElementsByTagName("s"):
+            sentence_i += 1
+
+            # iterate words, numbering them from 1 inside the sentence
+            for word_i, word in enumerate(sentence.getElementsByTagName("w"), start=1):
+
+                # obtain word info; an empty <w/> has no child node at all
+                textNode = word.firstChild
+                wordText = textNode.data if textNode is not None else ""
+                lemma = word.getAttribute("lemma")
+                ana = word.getAttribute("ana")
+                chunk = word.getAttribute("chunk")
+
+                # one Word per chunk the token belongs to
+                for c in chunk.split(","):
+                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
+                    words.append(w)
+                    chunks.setdefault((sentence_i, c), []).append(w)
+
+    return (words, chunks)
@@ -0,0 +1,34 @@
+# coding: utf-8
+import zeep
+
+def executeTtl(text):
+    """Send *text* to the TTL web service and return its answer as an XML string.
+
+    The text is first stripped of a few accented characters, converted to SGML
+    and processed by the service described at ``http://ws.racai.ro/ttlws.wsdl``.
+    The SGML entities in the answer are then replaced by literal characters
+    and the result is wrapped in an XML declaration and a ``<segs>`` root.
+    """
+    # Preprocess the text
+    for accented, plain in ((u'ĭ', 'i'), (u'ŭ', 'u'), (u'à', 'a')):
+        text = text.replace(accented, plain)
+
+    client = zeep.Client("http://ws.racai.ro/ttlws.wsdl")
+    textSgml = client.service.UTF8toSGML(text)
+    result = client.service.XCES("ro", "id", textSgml)
+
+    # Cleanup result - generate valid xml
+    entities = (
+        ('&rsquor;', '`'),
+        ('&abreve;', u'ă'),
+        ('&agrave;', u'à'),
+        ('&acirc;', u'â'),
+        ('&icirc;', u'î'),
+        ('&scedil;', u'ș'),
+        ('&tcedil;', u'ț'),
+        ('&ubreve;', u'u'),
+        ('&Abreve;', u'Ă'),
+        ('&Agrave;', u'À'),
+        ('&Acirc;', u'Â'),
+        ('&Icirc;', u'Î'),
+        ('&Scedil;', u'Ș'),
+        ('&Tcedil;', u'Ț'),
+        ('&Ubreve;', u'U'),
+    )
+    for entity, character in entities:
+        result = result.replace(entity, character)
+
+    return "<?xml version=\"1.0\" encoding=\"utf-8\" ?><segs>" + result + "</segs>"
				`@@ -0,0 +1 @@`
				`Maria are mere. Ea mai are șapte pere. Acestea sunt foarte delicioase.`
				`@@ -0,0 +1 @@`
				`Sabeer Bhatia a ajuns la Aeroportul Internațional din Los Angeles la ora 18 în data de 23 septembrie 1998. Zborul său din Bangalore a durat 22 ore, și el era înfometat.`
				`@@ -0,0 +1 @@`
				`Sophia Loren spune că ea va fi întotdeauna mulțumitoare față de Bono. Actrița a dezvăluit că cântărețul trupei U2 a ajutat-o să se liniștească atunci când ea s-a speriat de o furtună în timp ce zburau cu avionul.`
				`@@ -0,0 +1 @@`
				`books/Moara cu noroc - Ioan Slavici.epub;Moara cu noroc;1880;Ioan Slavici;1848;Transilvania`