commit 6badfbd103d0c906df1435ad9bfec16da7be2844 Author: Tiberiu Chibici Date: Tue May 23 13:57:53 2017 +0300 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3adb9c6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,107 @@ +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json + +logs/* + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..642dcaf --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,220 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Run without debugging", + "type": "python", + "request": "launch", + "stopOnEntry": false, + "pythonPath": "${config:python.pythonPath}", + "program": "${file}", + "cwd": "${workspaceRoot}", + "env": {}, + "envFile": "${workspaceRoot}/.env", + "debugOptions": [ + "WaitOnAbnormalExit", + "WaitOnNormalExit", + "RedirectOutput" + ] + }, + { + "name": "Python", + "type": "python", + "request": "launch", + "stopOnEntry": true, + "pythonPath": "${config:python.pythonPath}", + "program": "${file}", + "cwd": "${workspaceRoot}", + "env": {}, + "envFile": "${workspaceRoot}/.env", + "debugOptions": [ + "WaitOnAbnormalExit", + "WaitOnNormalExit", + "RedirectOutput" + ] + }, + { + "name": "PySpark", + "type": "python", + "request": "launch", + "stopOnEntry": true, + "osx": { + "pythonPath": "${env:SPARK_HOME}/bin/spark-submit" + }, + "windows": { + "pythonPath": "${env:SPARK_HOME}/bin/spark-submit.cmd" + }, + "linux": { + "pythonPath": "${env:SPARK_HOME}/bin/spark-submit" + }, + "program": "${file}", + "cwd": "${workspaceRoot}", + "env": {}, + "envFile": "${workspaceRoot}/.env", + "debugOptions": [ + "WaitOnAbnormalExit", + "WaitOnNormalExit", + "RedirectOutput" + ] + }, + { + "name": "Python Module", + "type": "python", + "request": "launch", + "stopOnEntry": true, + "pythonPath": "${config:python.pythonPath}", + "module": "module.name", + "cwd": "${workspaceRoot}", + "env": {}, + "envFile": "${workspaceRoot}/.env", + "debugOptions": [ + "WaitOnAbnormalExit", + "WaitOnNormalExit", + "RedirectOutput" + ] + }, + { + "name": "Integrated Terminal/Console", + "type": "python", + "request": "launch", + "stopOnEntry": true, + "pythonPath": "${config:python.pythonPath}", + 
"program": "${file}", + "cwd": "", + "console": "integratedTerminal", + "env": {}, + "envFile": "${workspaceRoot}/.env", + "debugOptions": [ + "WaitOnAbnormalExit", + "WaitOnNormalExit" + ] + }, + { + "name": "External Terminal/Console", + "type": "python", + "request": "launch", + "stopOnEntry": true, + "pythonPath": "${config:python.pythonPath}", + "program": "${file}", + "cwd": "", + "console": "externalTerminal", + "env": {}, + "envFile": "${workspaceRoot}/.env", + "debugOptions": [ + "WaitOnAbnormalExit", + "WaitOnNormalExit" + ] + }, + { + "name": "Django", + "type": "python", + "request": "launch", + "stopOnEntry": true, + "pythonPath": "${config:python.pythonPath}", + "program": "${workspaceRoot}/manage.py", + "cwd": "${workspaceRoot}", + "args": [ + "runserver", + "--noreload" + ], + "env": {}, + "envFile": "${workspaceRoot}/.env", + "debugOptions": [ + "WaitOnAbnormalExit", + "WaitOnNormalExit", + "RedirectOutput", + "DjangoDebugging" + ] + }, + { + "name": "Flask", + "type": "python", + "request": "launch", + "stopOnEntry": false, + "pythonPath": "${config:python.pythonPath}", + "program": "fully qualified path fo 'flask' executable. Generally located along with python interpreter", + "cwd": "${workspaceRoot}", + "env": { + "FLASK_APP": "${workspaceRoot}/quickstart/app.py" + }, + "args": [ + "run", + "--no-debugger", + "--no-reload" + ], + "envFile": "${workspaceRoot}/.env", + "debugOptions": [ + "WaitOnAbnormalExit", + "WaitOnNormalExit", + "RedirectOutput" + ] + }, + { + "name": "Flask (old)", + "type": "python", + "request": "launch", + "stopOnEntry": false, + "pythonPath": "${config:python.pythonPath}", + "program": "${workspaceRoot}/run.py", + "cwd": "${workspaceRoot}", + "args": [], + "env": {}, + "envFile": "${workspaceRoot}/.env", + "debugOptions": [ + "WaitOnAbnormalExit", + "WaitOnNormalExit", + "RedirectOutput" + ] + }, + { + "name": "Pyramid", + "type": "python", + "request": "launch", + "stopOnEntry": true, + "pythonPath": "${config:python.pythonPath}", + "cwd": "${workspaceRoot}", + "env": {}, + "envFile": "${workspaceRoot}/.env", + "args": [ + "${workspaceRoot}/development.ini" + ], + "debugOptions": [ + "WaitOnAbnormalExit", + "WaitOnNormalExit", + "RedirectOutput", + "Pyramid" + ] + }, + { + "name": "Watson", + "type": "python", + "request": "launch", + "stopOnEntry": true, + "pythonPath": "${config:python.pythonPath}", + "program": "${workspaceRoot}/console.py", + "cwd": "${workspaceRoot}", + "args": [ + "dev", + "runserver", + "--noreload=True" + ], + "env": {}, + "envFile": "${workspaceRoot}/.env", + "debugOptions": [ + "WaitOnAbnormalExit", + "WaitOnNormalExit", + "RedirectOutput" + ] + }, + { + "name": "Attach (Remote Debug)", + "type": "python", + "request": "attach", + "localRoot": "${workspaceRoot}", + "remoteRoot": "${workspaceRoot}", + "port": 3000, + "secret": "my_secret", + "host": "localhost" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..20af2f6 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +// Place your settings in this file to overwrite default and user settings. 
+{ +} \ No newline at end of file diff --git a/NLP.zip b/NLP.zip new file mode 100644 index 0000000..d5dade1 Binary files /dev/null and b/NLP.zip differ diff --git a/NLP/.project b/NLP/.project new file mode 100644 index 0000000..c0cf82b --- /dev/null +++ b/NLP/.project @@ -0,0 +1,17 @@ + + + NLP + + + + + + org.python.pydev.PyDevBuilder + + + + + + org.python.pydev.pythonNature + + diff --git a/NLP/.pydevproject b/NLP/.pydevproject new file mode 100644 index 0000000..b7dc105 --- /dev/null +++ b/NLP/.pydevproject @@ -0,0 +1,8 @@ + + + +/${PROJECT_DIR_NAME}/src + +python 3.0 +Python 3.5 + diff --git a/NLP/data/text1.txt b/NLP/data/text1.txt new file mode 100644 index 0000000..daa10c7 --- /dev/null +++ b/NLP/data/text1.txt @@ -0,0 +1 @@ +Maria are mere. Ea mai are șapte pere. Acestea sunt foarte delicioase. diff --git a/NLP/data/text1_processed.xml b/NLP/data/text1_processed.xml new file mode 100644 index 0000000..a38d3d7 --- /dev/null +++ b/NLP/data/text1_processed.xml @@ -0,0 +1,6 @@ + + +Mariaaremere. +Eamaiareșaptepere. +Acesteasuntfoartedelicioase. + diff --git a/NLP/data/text2.txt b/NLP/data/text2.txt new file mode 100644 index 0000000..b3455b6 --- /dev/null +++ b/NLP/data/text2.txt @@ -0,0 +1 @@ +Sabeer Bhatia a ajuns la Aeroportul Internațional din Los Angeles la ora 18 în data de 23 septembrie 1998. Zborul său din Bangalore a durat 22 ore, și el era înfometat. \ No newline at end of file diff --git a/NLP/data/text2_processed.xml b/NLP/data/text2_processed.xml new file mode 100644 index 0000000..5c45726 --- /dev/null +++ b/NLP/data/text2_processed.xml @@ -0,0 +1,5 @@ + + +SabeerBhatiaaajunslaAeroportulInternaționaldinLosAngeleslaora18îndatade23septembrie1998. +ZborulsăudinBangaloreadurat22ore,șieleraînfometat. + diff --git a/NLP/data/text3.txt b/NLP/data/text3.txt new file mode 100644 index 0000000..5e5e8ac --- /dev/null +++ b/NLP/data/text3.txt @@ -0,0 +1 @@ +Sophia Loren spune că ea va fi întotdeauna mulțumitoare față de Bono. Actrița a dezvăluit că cântărețul trupei U2 a ajutat-o să se liniștească atunci când ea s-a speriat de o furtună în timp ce zburau cu avionul. \ No newline at end of file diff --git a/NLP/data/text3_processed.xml b/NLP/data/text3_processed.xml new file mode 100644 index 0000000..9935ae6 --- /dev/null +++ b/NLP/data/text3_processed.xml @@ -0,0 +1,5 @@ + + +SophiaLorenspuneeavafiîntotdeaunamulțumitoarefață_deBono. +ActrițaadezvăluitcântărețultrupeiU2aajutat-oselinișteascăatunci_cândeas-asperiatdeofurtunăîn_timp_cezburaucuavionul. 
+ diff --git a/NLP/src/anaphora.py b/NLP/src/anaphora.py new file mode 100644 index 0000000..3e21300 --- /dev/null +++ b/NLP/src/anaphora.py @@ -0,0 +1,121 @@ +''' +Created on May 22, 2016 + +@author: tibi +''' +from model import Word + +def getGender(word): + + if word.isPronoun() and (word.pronounGetPerson() == '1' or word.pronounGetPerson() == '2'): + return 'n' + + return word.getGender() + +def genderMatch(word1, word2): + + g1 = getGender(word1) + g2 = getGender(word2) + + if g1 == g2: + return 2 + + if g1 == 'n' or g2 == 'n': + return 1 + + return 0 + +def isPrepositional(chunk): + + for word in chunk: + + if word.isPreposition(): + return True + + return False + +def countInText(noun, text): + + c = 0 + for word in text: + if word.text == noun.text: + c += 1 + + return c + +def anaphora(text, chunks): + + nounPhrases = [] + + for word in text: + + if word.isNoun(): + print("[n]", word) + nounPhrases.append((word, (word.sentenceIndex, word.chunk))) + + else: + print(word) + + if word.isPronoun(): + + candidates = [] + + for noun, chunkIndex in nounPhrases[:-30:-1]: + + # If gender and number match + if genderMatch(word, noun) > 0 and word.getNumber() == noun.getNumber(): + + npInd = genderMatch(word, noun) + + # definiteness + if not noun.nounIsDefinite(): + npInd -= 1 + + # non-prepositional noun phrase + chunk = chunks[chunkIndex] + if (isPrepositional(chunk)): + npInd -= 1 + + # first in sentence + if noun.sentenceIndex == 1: + npInd += 1 + + # indicating verbs + # todo... + + # lexical reiteration + c = countInText(noun, text) + if c == 2: + npInd += 1 + if c > 2: + npInd += 2 + + # noun is representing term + # how? + + # identical collocation pattern to the pronoun + # ??? + + # immediate reference, resolving 'it' + # applicable? + + # referential distance + dist = word.sentenceIndex - noun.sentenceIndex + if dist == 0: + npInd += 2 + elif dist == 1: + npInd += 1 + + candidates.append((noun, npInd)) + print("...> Candidate: {0} npInd = {1}".format(noun, npInd)) + + + if len(candidates) > 0: + + pickedWord, pickedInd = candidates[0] + for word, npInd in candidates: + if npInd > pickedInd: + pickedInd = npInd + pickedWord = word + + print(".>>> Picked: {0}".format(pickedWord)) diff --git a/NLP/src/fileparser.py b/NLP/src/fileparser.py new file mode 100644 index 0000000..7df5521 --- /dev/null +++ b/NLP/src/fileparser.py @@ -0,0 +1,55 @@ +''' +Created on May 22, 2016 + +@author: tibi +''' + +from xml.dom import minidom; +from model.Word import Word + +def parse(filename): + + words = [] + chunks = {} + + sentence_i = 0 + + # get the root "segs" element + dom = minidom.parse(filename) + alltext = dom.getElementsByTagName("segs") + + # iterate paragraphs + for paragraph in alltext[0].getElementsByTagName("seg"): + + # iterate sentences + for sentence in paragraph.getElementsByTagName("s"): + + # increment sentence index + sentence_i += 1 + word_i = 0 + + # iterate words + for word in sentence.getElementsByTagName("w"): + + # increment word index + word_i += 1 + + # obtain word info + wordText = word.firstChild.data + lemma = word.getAttribute("lemma") + ana = word.getAttribute("ana") + chunk = word.getAttribute("chunk") + + # create word + #w = Word(wordText, lemma, ana, chunk, sentence_i, word_i) + #words.append(w) + + for c in chunk.split(","): + w = Word(wordText, lemma, ana, c, sentence_i, word_i) + words.append(w) + if chunks.get((sentence_i, c)) == None: + chunks[(sentence_i, c)] = [ w ] + else: + chunks[(sentence_i, c)].append(w) + + return (words, chunks) diff --git 
a/NLP/src/main.py b/NLP/src/main.py
new file mode 100644
index 0000000..afebe48
--- /dev/null
+++ b/NLP/src/main.py
@@ -0,0 +1,26 @@
+'''
+Created on May 22, 2016
+
+@author: tibi
+'''
+import fileparser
+from anaphora import anaphora
+
+if __name__ == '__main__':
+
+    words, chunks = fileparser.parse("../data/text3_processed.xml")
+
+    print("Words:")
+    for word in words:
+        print("[{0} {1}] {2}".format(word.sentenceIndex, word.wordIndex, word))
+    print("")
+
+    print("Chunks:")
+    for key, value in chunks.items():
+        print(key, ":")
+        for word in value:
+            print(" - ", word)
+    print("")
+
+    print("Anaphora resolution:")
+    anaphora(words, chunks)
diff --git a/NLP/src/model/Word.py b/NLP/src/model/Word.py
new file mode 100644
index 0000000..52e9833
--- /dev/null
+++ b/NLP/src/model/Word.py
@@ -0,0 +1,88 @@
+'''
+Created on May 22, 2016
+
+@author: tibi
+'''
+
+class Word:
+
+    text = ""
+    lemma = ""
+    ana = ""
+    chunk = ""
+
+    sentenceIndex = 0
+    wordIndex = 0
+
+    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
+        self.text = text
+        self.lemma = lemma
+        self.ana = ana
+        self.chunk = chunk
+        self.sentenceIndex = sentenceIndex
+        self.wordIndex = wordIndex
+
+    def __str__(self):
+        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)
+
+    def isNoun(self):
+        return self.ana[0] == "N"
+
+    def nounIsCommon(self):
+        return self.isNoun() and self.ana[1] == "c"
+
+    def nounIsProper(self):
+        return self.isNoun() and self.ana[1] == "p"
+
+    def nounGetCase(self):
+
+        if self.isNoun():
+            return self.ana[4]
+
+        return None
+
+    # Is the noun definite (does it carry a definite article)?
+    def nounIsDefinite(self):
+        if self.isNoun():
+            if (self.nounIsProper()):
+                return True
+
+            if len(self.ana) > 5:
+                return self.ana[5] == "y"  # definiteness flag is 'y'/'n'
+
+        return False
+
+    def pronounGetPerson(self):
+        if self.isPronoun():
+            return self.ana[2]
+
+    def getGender(self):
+        if self.isNoun():
+            if (len(self.ana) >= 3):
+                return self.ana[2]
+            return 'n'
+
+        if self.isPronoun():
+            return self.ana[3]
+
+        return None
+
+    def getNumber(self):
+        if self.isNoun():
+            if self.nounIsProper():
+                return 's'
+            else:
+                return self.ana[3]
+        if self.isPronoun():
+            return self.ana[4]
+
+        return None
+
+    def isPronoun(self):
+        return self.ana[0] == "P"
+
+    def isVerb(self):
+        return self.ana[0] == "V"
+
+    def isPreposition(self):
+        return self.ana[0] == "S" and self.ana[1] == "p"
\ No newline at end of file
diff --git a/NLP/src/model/__init__.py b/NLP/src/model/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/data/index.csv b/data/index.csv
new file mode 100644
index 0000000..825f152
--- /dev/null
+++ b/data/index.csv
@@ -0,0 +1 @@
+books/Moara cu noroc - Ioan Slavici.epub;Moara cu noroc;1880;Ioan Slavici;1848;Transilvania
\ No newline at end of file
diff --git a/data/results.db b/data/results.db
new file mode 100644
index 0000000..f0bae2b
Binary files /dev/null and b/data/results.db differ
diff --git a/data/texts.db b/data/texts.db
new file mode 100644
index 0000000..ccafb68
Binary files /dev/null and b/data/texts.db differ
diff --git a/data/texts.db.bak b/data/texts.db.bak
new file mode 100644
index 0000000..d33689b
Binary files /dev/null and b/data/texts.db.bak differ
diff --git a/data/texts2.db b/data/texts2.db
new file mode 100644
index 0000000..52f33ac
Binary files /dev/null and b/data/texts2.db differ
diff --git a/src/logger.py b/src/logger.py
new file mode 100644
index 0000000..d544460
--- /dev/null
+++ b/src/logger.py
@@ -0,0 +1,20 @@
+import time
+import logging
+
+def init_logger(level):
+    # Log filename
+    tm = 
time.strftime('%Y-%m-%d_%H-%M-%S') + logFile = "logs/log_{0}.log".format(tm) + + # Set up file logger + logging.basicConfig(filename=logFile, + level=logging.DEBUG, + format='%(asctime)s %(name)s %(levelname)s %(message)s', + datefmt='%m-%d %H:%M') + + # Set up console logger + formatter = logging.Formatter('[%(name)s] %(levelname)s: %(message)s') + console = logging.StreamHandler() + console.setLevel(level) + console.setFormatter(formatter) + logging.getLogger().addHandler(console) \ No newline at end of file diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..6634a66 --- /dev/null +++ b/src/main.py @@ -0,0 +1,40 @@ +import logging +import time +# own +import logger +import storage.data +import storage.results +import textprocessor.letterfreq +import ttl.ttlparser +import ttl.ttlservice + +def init(): + logger.init_logger(logging.WARNING) + storage.data.initializeFragmentDatabase("data/texts.db") + storage.results.initializeResultsDatabase("data/results.db", True) + +def processTexts(): + count = storage.data.getTextCount() + current = 0 + for item in storage.data.getAllTexts(): + print("Processing item", current, "out of", count) + current = current + 1 + + itemid = item[0] + itemtext = item[1] + + # obtain ttl analysis + # unfeasable - it takes 5-10 minutes for a single text + # ttlResult = ttl.ttlservice.executeTtl(itemtext) + # (words, chunks) = ttl.ttlparser.parseText(ttlResult) + # storage.results.storeTtlAnalysis(itemid, words) + + # perform analysis + letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext) + storage.results.storeFrequencies(itemid, letterFreq) + + print("Finished!") + +init() +processTexts() + diff --git a/src/model.py b/src/model.py new file mode 100644 index 0000000..6522ad4 --- /dev/null +++ b/src/model.py @@ -0,0 +1,32 @@ +# Defines a fragment author +class Author: + def __init__(self, name = "", birthYear = "", location = "Romania"): + self.name = name + self.yearOfBirth = birthYear + self.location = location + + def __str__(self): + return self.name + + def __repr__(self): + return self.name + + def dump(self): + return "[Author name={0} yearOfBirth={1} location={2}]".format(self.name, self.yearOfBirth, self.location) + +# Defines a text fragment +class Fragment: + def __init__(self, title = "", text = "", author = Author(), year = 1999): + self.title = title + self.text = text + self.author = author + self.year = year + + def __str__(self): + return self.title + + def __repr__(self): + return self.title + + def dump(self): + return "[Fragment title={0} author={1} year={2} text={3}]".format(self.title, self.author.dump(), self.year, self.text) diff --git a/src/model/Word.py b/src/model/Word.py new file mode 100644 index 0000000..5d33ea9 --- /dev/null +++ b/src/model/Word.py @@ -0,0 +1,87 @@ + +# Defines a processed word +class Word: + + text = "" + lemma = "" + ana = "" + chunk = "" + + sentenceIndex = 0 + wordIndex = 0 + + def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex): + self.text = text + self.lemma = lemma + self.ana = ana + self.chunk = chunk + self.sentenceIndex = sentenceIndex + self.wordIndex = wordIndex + + def __str__(self): + return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk) + + def __repr__(self): + return str(self) + + def isNoun(self): + return self.ana[0] == "N" + + def nounIsCommon(self): + return self.isNoun() and self.ana[1] == "c" + + def nounIsProper(self): + return self.isNoun() and self.ana[1] == "p" + + def nounGetCase(self): + + if 
self.isNoun():
+            return self.ana[4]
+
+        return None
+
+    # Is the noun definite (does it carry a definite article)?
+    def nounIsDefinite(self):
+        if self.isNoun():
+            if (self.nounIsProper()):
+                return True
+
+            if len(self.ana) > 5:
+                return self.ana[5] == "y"  # definiteness flag is 'y'/'n'
+
+        return False
+
+    def pronounGetPerson(self):
+        if self.isPronoun():
+            return self.ana[2]
+
+    def getGender(self):
+        if self.isNoun():
+            if (len(self.ana) >= 3):
+                return self.ana[2]
+            return 'n'
+
+        if self.isPronoun():
+            return self.ana[3]
+
+        return None
+
+    def getNumber(self):
+        if self.isNoun():
+            if self.nounIsProper():
+                return 's'
+            else:
+                return self.ana[3]
+        if self.isPronoun():
+            return self.ana[4]
+
+        return None
+
+    def isPronoun(self):
+        return self.ana[0] == "P"
+
+    def isVerb(self):
+        return self.ana[0] == "V"
+
+    def isPreposition(self):
+        return self.ana[0] == "S" and self.ana[1] == "p"
diff --git a/src/model/__init__.py b/src/model/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/storage/__init__.py b/src/storage/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/storage/data.py b/src/storage/data.py
new file mode 100644
index 0000000..a3a4bcd
--- /dev/null
+++ b/src/storage/data.py
@@ -0,0 +1,80 @@
+import logging
+import os
+from model import *
+import sqlite3
+
+log = logging.getLogger("storage")
+
+DB_FRAGMENTS = ""
+
+# Commands
+
+# birth location - general area, not exact location (e.g. Transylvania)
+# birth origin - rural or urban
+# studies - masters, bachelors, high school, middle school, primary school
+# occupation - comma separated if there are multiple
+# studiesAbroad - foreign cities where author studied (comma separated)
+COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
+        name TEXT PRIMARY KEY,
+        birthYear INTEGER,
+        birthLocation TEXT,
+        birthOrigin TEXT,
+        studies TEXT,
+        occupations TEXT,
+        studiesAbroad TEXT
+    )"""
+
+# genre - short story (nuvela), novel (roman), poem etc
+# movement - literary movement (submovements separated by /) (e.g. realism/naturalism)
+# tags - other relevant information (e.g. psychological)
+COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
+        id INTEGER PRIMARY KEY,
+        title TEXT,
+        year INTEGER,
+        author TEXT REFERENCES Authors(name),
+        genre TEXT,
+        movement TEXT,
+        tags TEXT
+    )"""
+
+# contains the actual text
+COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
+        id INTEGER REFERENCES Fragments(id),
+        content TEXT
+    )"""
+
+# Initialize databases
+def initializeFragmentDatabase(dbFile):
+    global DB_FRAGMENTS
+    DB_FRAGMENTS = dbFile
+
+    if not os.path.exists(dbFile):
+        log.info("Text database %s not found. 
Will create database.", dbFile) + con = sqlite3.connect(dbFile) + c = con.cursor() + c.execute(COMMAND_CREATE_AUTHORS) + c.execute(COMMAND_CREATE_FRAGMENTS) + c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT) + con.commit() + con.close() + log.info("Database created!") + +def getTextCount(): + con = sqlite3.connect(DB_FRAGMENTS) + c = con.cursor() + c.execute("SELECT COUNT(*) FROM Fragments") + item = c.fetchone() + c.close() + con.close() + return item[0] + +def getAllTexts(): + con = sqlite3.connect(DB_FRAGMENTS) + c = con.cursor() + c.execute("SELECT id, content FROM FragmentsContent") + + items = c.fetchall() + + c.close() + con.close() + return items \ No newline at end of file diff --git a/src/storage/results.py b/src/storage/results.py new file mode 100644 index 0000000..6bba18b --- /dev/null +++ b/src/storage/results.py @@ -0,0 +1,84 @@ +import logging +import os +from model.Word import * +import sqlite3 + +log = logging.getLogger("storage") + +DB_RESULTS = "" + +COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies ( + idtext INTEGER, + lettergroup TEXT, + category TEXT, + frequency REAL + )""" + +COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords ( + idtext INTEGER, + wordIndex INTEGER, + sentenceIndex INTEGER, + word TEXT, + lemma TEXT, + analysis TEXT, + chunk TEXT + )""" + +# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram ( +# idtext INTEGER, +# wordlength INTEGER, +# frequency REAL +# )""" + +def initializeResultsDatabase(dbFile, cleanupOldData): + global DB_RESULTS + DB_RESULTS = dbFile + + # cleanup old data + if cleanupOldData: + con = sqlite3.connect(DB_RESULTS) + c = con.cursor() + + try: + c.execute("DROP TABLE LetterFrequencies") + except sqlite3.OperationalError: + pass + c.execute(COMMAND_CREATE_LETTER_FREQUENCIES) + + try: + c.execute("DROP TABLE TextWords") + except sqlite3.OperationalError: + pass + c.execute(COMMAND_CREATE_TEXT_WORDS) + + con.commit() + c.close() + con.close() + + +def storeFrequencies(idtext, freq): + con = sqlite3.connect(DB_RESULTS) + c = con.cursor() + + # add data + chr = ['p', 'l1', 'l2', 'l3'] + for i in range(4): + for let, fr in freq[i]: + c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, chr[i], fr)) + + con.commit() + c.close() + con.close() + +def storeTtlAnalysis(idtext, words): + con = sqlite3.connect(DB_RESULTS) + c = con.cursor() + + # store words + for word in words: + c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk)) + + # finish + con.commit() + c.close() + con.close() \ No newline at end of file diff --git a/src/test.py b/src/test.py new file mode 100644 index 0000000..43a1da1 --- /dev/null +++ b/src/test.py @@ -0,0 +1,14 @@ +# coding: utf-8 +from ttl import ttlservice +from ttl import ttlparser +import nltk + +import storage + +data = storage.parseIndex("data") +print(data) + +#textXml = ttlservice.executeTtl(u"Numele meu este Tibi și îmi place să cânt la chitară bass. 
Ce faci?")
+#words, chunks = ttlparser.parseText(textXml)
+#print ("Words: ", words)
+#print ("Chunks: ", chunks)
\ No newline at end of file
diff --git a/src/textprocessor/__init__.py b/src/textprocessor/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/textprocessor/letterfreq.py b/src/textprocessor/letterfreq.py
new file mode 100644
index 0000000..28f5fb2
--- /dev/null
+++ b/src/textprocessor/letterfreq.py
@@ -0,0 +1,38 @@
+import operator
+import storage
+
+def letterFrequencies(text):
+    letterfreq = [{}, {}, {}, {}]
+    lettersum = [0, 0, 0, 0]
+
+    n = len(text)
+    for i in range(n):
+
+        # compute substring frequency
+        # l = substring length
+        for l in range(1, 4):
+            sub = text[i : i + l].lower()
+            if len(sub) == l and sub.isalnum():
+                lettersum[l] += 1
+                if not sub in letterfreq[l]:
+                    letterfreq[l][sub] = 1
+                else:
+                    letterfreq[l][sub] += 1
+
+        # compute punctuation frequency
+        chr = text[i]
+        if not chr.isalnum() and not chr.isspace() and chr.isprintable():
+            lettersum[0] += 1
+            if not chr in letterfreq[0]:
+                letterfreq[0][chr] = 1
+            else:
+                letterfreq[0][chr] += 1
+
+    # Almost done. Sort and remove irrelevant items (with low frequency), and normalize data
+    for i in range(4):
+        freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True)
+        freqFiltered = freqSorted[0:50]
+        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqFiltered]
+        letterfreq[i] = freqNormalized
+
+    return letterfreq
\ No newline at end of file
diff --git a/src/textprocessor/wordanalysis.py b/src/textprocessor/wordanalysis.py
new file mode 100644
index 0000000..ac2af3a
--- /dev/null
+++ b/src/textprocessor/wordanalysis.py
@@ -0,0 +1,2 @@
+def analyzeWords(text):
+    pass
\ No newline at end of file
diff --git a/src/tools/wikisource_downloader.py b/src/tools/wikisource_downloader.py
new file mode 100644
index 0000000..2502ab7
--- /dev/null
+++ b/src/tools/wikisource_downloader.py
@@ -0,0 +1,128 @@
+import urllib.request, urllib.error
+from pyquery import PyQuery
+import sqlite3
+import re
+
+BASE_URL = "https://ro.wikisource.org"
+LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+
+def getAuthorList():
+    authors = []
+    for letter in LETTERS:
+        print("Processing link page for letter", letter)
+        # Read index page
+        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
+        data = urllib.request.urlopen(url).read()
+        q = PyQuery(data)
+        for item in q("div.mw-category-generated").find("a"):
+            if (item.text.startswith("Autor:")):
+                authorname = item.text[6:]
+                authorlink = item.attrib['href']
+                authors.append((authorname, authorlink))
+    return list(set(authors))
+
+def getAuthorWikiLink(query):
+    wikilink = None
+    body = query("div#mw-content-text")
+    table = body.find("table")
+    for link in table.find("a"):
+        if "ro.wikipedia.org" in link.attrib['href']:
+            wikilink = link.attrib['href']
+    return wikilink
+
+def getAuthorLinksList(authorname, query):
+    links = []
+    body = query("div#mw-content-text")
+    for link in body.find("a"):
+        address = link.attrib['href']
+        ok = True
+        if "http" in address:
+            ok = False
+        if "redlink" in address:
+            ok = False
+        if "Fi%C8%99ier:" in address:
+            ok = False
+        if "index.php" in address:
+            ok = False
+        if address.startswith("#"):
+            ok = False
+        if "Autor:" in address:
+            ok = False
+        if ok:
+            links.append(link.attrib['href'])
+    return links
+
+def getAuthorBasicInfo(authorname, authorlink):
+    info = {}
+    data = urllib.request.urlopen(BASE_URL + authorlink).read()
+    q = PyQuery(data)
+
+    info["wiki"] = getAuthorWikiLink(q)
+    info["links"] = 
getAuthorLinksList(authorname, q) + + return info + +# def getAuthorWikiInfo(authorinfo): + +# # Nothing can be learned without wiki page +# if authorinfo["wiki"] is None: +# return authorinfo + +# try: +# data = urllib.request.urlopen(authorinfo["wiki"]).read() +# q = PyQuery(data) + +# # Find the birth date +# body = q("#mw-content-text").text() +# result = re.compile(u"Născut\s+([\w\s]+)").match(body) +# if not result is None: +# authorinfo["birthyear"] = result.group(0) + +# except urllib.error.HTTPError: +# pass + +# return authorinfo + +def getText(url): + data = urllib.request.urlopen(BASE_URL + url).read() + q = PyQuery(data) + + texttitle = q("h1").text() + + body = q("#mw-content-text") + body.find("table").remove() + + textcontent = body.text() + return (texttitle, textcontent) + +def addAuthorToDb(authorinfo): + con = sqlite3.connect("data/texts.db") + c = con.cursor() + c.execute("INSERT INTO Authors") + +def getAllTexts(): + + con = sqlite3.connect("data/texts.db") + c = con.cursor() + #c.execute("ALTER TABLE Authors ADD COLUMN wiki TEXT") + id = 1 + + authors = getAuthorList() + for authorname, authorlink in authors: + print("Processing author", authorname) + authorinfo = getAuthorBasicInfo(authorname, authorlink) + c.execute("INSERT INTO Authors(name,wiki) VALUES(?, ?)", (authorname, authorinfo["wiki"])) + + # authorinfo = getAuthorWikiInfo(authorinfo) + for text in authorinfo["links"]: + try: + title, content = getText(text) + c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)", (id, title, authorname)) + c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)", (id, content)) + id = id + 1 + except urllib.error.HTTPError: + continue + + con.commit() + +getAllTexts() \ No newline at end of file diff --git a/src/ttl/__init__.py b/src/ttl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/ttl/ttlparser.py b/src/ttl/ttlparser.py new file mode 100644 index 0000000..59a5840 --- /dev/null +++ b/src/ttl/ttlparser.py @@ -0,0 +1,62 @@ +''' +Created on May 22, 2016 + +@author: tibi +''' + +from xml.dom import minidom; +from xml.parsers.expat import ExpatError +from model.Word import Word + +def parseText(xmlText): + + words = [] + chunks = {} + + sentence_i = 0 + + # get the root "segs" element + try: + dom = minidom.parseString(xmlText) + except ExpatError as e: + print("Error in text:", xmlText) + print(e) + exit(-1) + + alltext = dom.getElementsByTagName("segs") + + # iterate paragraphs + for paragraph in alltext[0].getElementsByTagName("seg"): + + # iterate sentences + for sentence in paragraph.getElementsByTagName("s"): + + # increment sentence index + sentence_i += 1 + word_i = 0 + + # iterate words + for word in sentence.getElementsByTagName("w"): + + # increment word index + word_i += 1 + + # obtain word info + wordText = word.firstChild.data + lemma = word.getAttribute("lemma") + ana = word.getAttribute("ana") + chunk = word.getAttribute("chunk") + + # create word + #w = Word(wordText, lemma, ana, chunk, sentence_i, word_i) + #words.append(w) + + for c in chunk.split(","): + w = Word(wordText, lemma, ana, c, sentence_i, word_i) + words.append(w) + if chunks.get((sentence_i, c)) == None: + chunks[(sentence_i, c)] = [ w ] + else: + chunks[(sentence_i, c)].append(w) + + return (words, chunks) diff --git a/src/ttl/ttlservice.py b/src/ttl/ttlservice.py new file mode 100644 index 0000000..a5208a5 --- /dev/null +++ b/src/ttl/ttlservice.py @@ -0,0 +1,34 @@ +# coding: utf-8 +import zeep + +def executeTtl(text): + # Preprocess the 
text
+    text = text.replace(u'ĭ', 'i')
+    text = text.replace(u'ŭ', 'u')
+    text = text.replace(u'à', 'a')
+
+    client = zeep.Client("http://ws.racai.ro/ttlws.wsdl")
+    textSgml = client.service.UTF8toSGML(text)
+    result = client.service.XCES("ro", "id", textSgml)
+
+    # Cleanup result - generate valid xml (normalize cedilla diacritics to the comma-below forms)
+    result = result.replace('’', '`')
+    result = result.replace('ă', u'ă')
+    result = result.replace('à', u'à')
+    result = result.replace('â', u'â')
+    result = result.replace('î', u'î')
+    result = result.replace('ş', u'ș')
+    result = result.replace('ţ', u'ț')
+    result = result.replace('ŭ', u'u')
+    result = result.replace('Ă', u'Ă')
+    result = result.replace('À', u'À')
+    result = result.replace('Â', u'Â')
+    result = result.replace('Î', u'Î')
+    result = result.replace('Ş', u'Ș')
+    result = result.replace('Ţ', u'Ț')
+    result = result.replace('Ŭ', u'U')
+
+    xmlResult = "<segs>"  # wrap the service output so ttlparser can find the "segs" root element
+    xmlResult += result
+    xmlResult += "</segs>"
+    return xmlResult
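
A minimal usage sketch of the Word accessors defined above, assuming the MULTEXT-East-style positional MSD tags that the code indexes into (POS, type, gender, number, case, definiteness for nouns); the sample word, lemma, tag "Ncfprn" and chunk label are illustrative only, and the snippet assumes NLP/src is on the Python path:

# Illustrative only: decoding one MSD analysis string with model.Word
from model.Word import Word

# "Ncfprn" = Noun, common, feminine, plural, direct case, indefinite
w = Word("pere", "pară", "Ncfprn", "Np#1", sentenceIndex=2, wordIndex=4)

print(w.isNoun())          # True -> ana[0] == "N"
print(w.nounIsCommon())    # True -> ana[1] == "c"
print(w.getGender())       # 'f'  -> ana[2]
print(w.getNumber())       # 'p'  -> ana[3]
print(w.nounGetCase())     # 'r'  -> ana[4]
print(w.nounIsDefinite())  # indefinite: ana[5] is 'n'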