Initial commit
commit 6badfbd103
107  .gitignore  vendored  Normal file
@@ -0,0 +1,107 @@
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json

logs/*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
220  .vscode/launch.json  vendored  Normal file
@@ -0,0 +1,220 @@
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Run without debugging",
            "type": "python",
            "request": "launch",
            "stopOnEntry": false,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Python",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "PySpark",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "osx": {
                "pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
            },
            "windows": {
                "pythonPath": "${env:SPARK_HOME}/bin/spark-submit.cmd"
            },
            "linux": {
                "pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
            },
            "program": "${file}",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Python Module",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "module": "module.name",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Integrated Terminal/Console",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "cwd": "",
            "console": "integratedTerminal",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit"
            ]
        },
        {
            "name": "External Terminal/Console",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "cwd": "",
            "console": "externalTerminal",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit"
            ]
        },
        {
            "name": "Django",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${workspaceRoot}/manage.py",
            "cwd": "${workspaceRoot}",
            "args": [
                "runserver",
                "--noreload"
            ],
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput",
                "DjangoDebugging"
            ]
        },
        {
            "name": "Flask",
            "type": "python",
            "request": "launch",
            "stopOnEntry": false,
            "pythonPath": "${config:python.pythonPath}",
            "program": "fully qualified path of 'flask' executable. Generally located along with python interpreter",
            "cwd": "${workspaceRoot}",
            "env": {
                "FLASK_APP": "${workspaceRoot}/quickstart/app.py"
            },
            "args": [
                "run",
                "--no-debugger",
                "--no-reload"
            ],
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Flask (old)",
            "type": "python",
            "request": "launch",
            "stopOnEntry": false,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${workspaceRoot}/run.py",
            "cwd": "${workspaceRoot}",
            "args": [],
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Pyramid",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "args": [
                "${workspaceRoot}/development.ini"
            ],
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput",
                "Pyramid"
            ]
        },
        {
            "name": "Watson",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${workspaceRoot}/console.py",
            "cwd": "${workspaceRoot}",
            "args": [
                "dev",
                "runserver",
                "--noreload=True"
            ],
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Attach (Remote Debug)",
            "type": "python",
            "request": "attach",
            "localRoot": "${workspaceRoot}",
            "remoteRoot": "${workspaceRoot}",
            "port": 3000,
            "secret": "my_secret",
            "host": "localhost"
        }
    ]
}
3  .vscode/settings.json  vendored  Normal file
@@ -0,0 +1,3 @@
// Place your settings in this file to overwrite default and user settings.
{
}
17  NLP/.project  Normal file
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>NLP</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.python.pydev.PyDevBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.python.pydev.pythonNature</nature>
    </natures>
</projectDescription>
8  NLP/.pydevproject  Normal file
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?><pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/${PROJECT_DIR_NAME}/src</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 3.0</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python 3.5</pydev_property>
</pydev_project>
1  NLP/data/text1.txt  Normal file
@@ -0,0 +1 @@
Maria are mere. Ea mai are șapte pere. Acestea sunt foarte delicioase.
6  NLP/data/text1_processed.xml  Normal file
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Maria" ana="Np" chunk="Np#1">Maria</w><w lemma="avea" ana="Vmip3s" chunk="Vp#1">are</w><w lemma="măr" ana="Ncfp-n" chunk="Np#2">mere</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="el" ana="Pp3fsr--------s">Ea</w><w lemma="mai" ana="Rp" chunk="Ap#1">mai</w><w lemma="avea" ana="Vmip3s" chunk="Vp#1">are</w><w lemma="șapte" ana="Mc-p-l" chunk="Np#1">șapte</w><w lemma="pară" ana="Ncfp-n" chunk="Np#1">pere</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.3"><w lemma="acesta" ana="Pd3fpr">Acestea</w><w lemma="fi" ana="Vmip3p" chunk="Vp#1">sunt</w><w lemma="foarte" ana="Rp" chunk="Ap#1,Vp#1">foarte</w><w lemma="delicios" ana="Afpfp-n" chunk="Ap#1">delicioase</w><c>.</c></s></seg>
</segs>
1  NLP/data/text2.txt  Normal file
@@ -0,0 +1 @@
Sabeer Bhatia a ajuns la Aeroportul Internațional din Los Angeles la ora 18 în data de 23 septembrie 1998. Zborul său din Bangalore a durat 22 ore, și el era înfometat.
5  NLP/data/text2_processed.xml  Normal file
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Sabeer" ana="Np" chunk="Np#1">Sabeer</w><w lemma="Bhatia" ana="Np" chunk="Np#1">Bhatia</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="ajunge" ana="Vmp--sm" chunk="Vp#1">ajuns</w><w lemma="la" ana="Spsa" chunk="Pp#1">la</w><w lemma="aeroport" ana="Ncmsry" chunk="Pp#1,Np#2">Aeroportul</w><w lemma="internațional" ana="Afpms-n" chunk="Pp#1,Np#2,Ap#1">Internațional</w><w lemma="din" ana="Spsa" chunk="Pp#2">din</w><w lemma="Los" ana="Np" chunk="Pp#2,Np#3">Los</w><w lemma="Angeles" ana="Np" chunk="Pp#2,Np#3">Angeles</w><w lemma="la" ana="Spsa" chunk="Pp#3">la</w><w lemma="oră" ana="Ncfsry" chunk="Pp#3,Np#4">ora</w><w lemma="18" ana="Mc" chunk="Pp#3,Np#4">18</w><w lemma="în" ana="Spsa" chunk="Pp#4">în</w><w lemma="dată" ana="Ncfsry" chunk="Pp#4,Np#5">data</w><w lemma="de" ana="Spsa" chunk="Pp#5">de</w><w lemma="23" ana="Mc" chunk="Pp#5,Np#6">23</w><w lemma="septembrie" ana="Ncms-n" chunk="Pp#5,Np#6">septembrie</w><w lemma="1998" ana="Mc" chunk="Pp#5,Np#6">1998</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="zbor" ana="Ncmsry" chunk="Np#1">Zborul</w><w lemma="său" ana="Ds3ms-s" chunk="Np#1">său</w><w lemma="din" ana="Spsa" chunk="Pp#1">din</w><w lemma="Bangalore" ana="Np" chunk="Pp#1,Np#2">Bangalore</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="dura" ana="Vmp--sm" chunk="Vp#1">durat</w><w lemma="22" ana="Mc" chunk="Np#3">22</w><w lemma="oră" ana="Ncfp-n" chunk="Np#3">ore</w><c>,</c><w lemma="și" ana="Crssp">și</w><w lemma="el" ana="Pp3msr--------s" chunk="Vp#2">el</w><w lemma="fi" ana="Vaii3s" chunk="Vp#2">era</w><w lemma="înfometa" ana="Vmp--sm" chunk="Vp#2">înfometat</w><c>.</c></s></seg>
</segs>
1  NLP/data/text3.txt  Normal file
@@ -0,0 +1 @@
Sophia Loren spune că ea va fi întotdeauna mulțumitoare față de Bono. Actrița a dezvăluit că cântărețul trupei U2 a ajutat-o să se liniștească atunci când ea s-a speriat de o furtună în timp ce zburau cu avionul.
5  NLP/data/text3_processed.xml  Normal file
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8" ?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Sophia" ana="Np" chunk="Np#1">Sophia</w><w lemma="Loren" ana="Np" chunk="Np#1">Loren</w><w lemma="spune" ana="Vmnp" chunk="Vp#1">spune</w><w lemma="că" ana="Csssp">că</w><w lemma="el" ana="Pp3fsr--------s" chunk="Vp#2">ea</w><w lemma="vrea" ana="Va--3s" chunk="Vp#2">va</w><w lemma="fi" ana="Vmnp" chunk="Vp#2">fi</w><w lemma="întotdeauna" ana="Rgp" chunk="Vp#2,Ap#1">întotdeauna</w><w lemma="mulțumitor" ana="Afpf--n" chunk="Ap#1">mulțumitoare</w><w lemma="față_de" ana="Spca" chunk="Pp#1">față_de</w><w lemma="bonă" ana="Ncfsvy" chunk="Pp#1,Np#2">Bono</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="actriță" ana="Ncfsry" chunk="Np#1">Actrița</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="dezvălui" ana="Vmp--sm" chunk="Vp#1">dezvăluit</w><w lemma="că" ana="Csssp">că</w><w lemma="cântăreț" ana="Ncmsry" chunk="Np#2">cântărețul</w><w lemma="trupă" ana="Ncfsoy" chunk="Np#2">trupei</w><w lemma="U2" ana="Np" chunk="Np#2">U2</w><w lemma="avea" ana="Va--3s" chunk="Vp#2">a</w><w lemma="ajuta" ana="Vmp--sm" chunk="Vp#2">ajutat</w><w lemma="el" ana="Pp3fsa--y-----w">-o</w><w lemma="să" ana="Qs" chunk="Vp#3">să</w><w lemma="sine" ana="Px3--a--------w" chunk="Vp#3">se</w><w lemma="liniști" ana="Vmsp3" chunk="Vp#3">liniștească</w><w lemma="atunci_când" ana="Rw" chunk="Vp#3,Ap#1">atunci_când</w><w lemma="el" ana="Pp3fsr--------s">ea</w><w lemma="sine" ana="Px3--a--y-----w" chunk="Vp#4">s-</w><w lemma="avea" ana="Va--3s" chunk="Vp#4">a</w><w lemma="speria" ana="Vmp--sm" chunk="Vp#4">speriat</w><w lemma="de" ana="Spsa" chunk="Pp#1">de</w><w lemma="un" ana="Tifsr" chunk="Pp#1,Np#3">o</w><w lemma="furtună" ana="Ncfsrn" chunk="Pp#1,Np#3">furtună</w><w lemma="în_timp_ce" ana="Cscsp">în_timp_ce</w><w lemma="zbura" ana="Vmii3p" chunk="Vp#5">zburau</w><w lemma="cu" ana="Spsa" chunk="Pp#2">cu</w><w lemma="avion" ana="Ncmsry" chunk="Pp#2,Np#4">avionul</w><c>.</c></s></seg>
</segs>
121  NLP/src/anaphora.py  Normal file
@@ -0,0 +1,121 @@
'''
Created on May 22, 2016

@author: tibi
'''
from model import Word


def getGender(word):

    if word.isPronoun() and (word.pronounGetPerson() == '1' or word.pronounGetPerson() == '2'):
        return 'n'

    return word.getGender()


def genderMatch(word1, word2):

    g1 = getGender(word1)
    g2 = getGender(word2)

    if g1 == g2:
        return 2

    if g1 == 'n' or g2 == 'n':
        return 1

    return 0


def isPrepositional(chunk):

    for word in chunk:

        if word.isPreposition():
            return True

    return False


def countInText(noun, text):

    c = 0
    for word in text:
        if word.text == noun.text:
            c += 1

    return c


def anaphora(text, chunks):

    nounPhrases = []

    for word in text:

        if word.isNoun():
            print("[n]", word)
            nounPhrases.append((word, (word.sentenceIndex, word.chunk)))

        else:
            print(word)

        if word.isPronoun():

            candidates = []

            for noun, chunkIndex in nounPhrases[:-30:-1]:

                # If gender and number match
                if genderMatch(word, noun) > 0 and word.getNumber() == noun.getNumber():

                    npInd = genderMatch(word, noun)

                    # definiteness
                    if not noun.nounIsDefinite():
                        npInd -= 1

                    # non-prepositional noun phrase
                    chunk = chunks[chunkIndex]
                    if (isPrepositional(chunk)):
                        npInd -= 1

                    # first in sentence
                    if noun.sentenceIndex == 1:
                        npInd += 1

                    # indicating verbs
                    # todo...

                    # lexical reiteration
                    c = countInText(noun, text)
                    if c == 2:
                        npInd += 1
                    if c > 2:
                        npInd += 2

                    # noun is representing term
                    # how?

                    # identical collocation pattern to the pronoun
                    # ???

                    # immediate reference, resolving 'it'
                    # applicable?

                    # referential distance
                    dist = word.sentenceIndex - noun.sentenceIndex
                    if dist == 0:
                        npInd += 2
                    elif dist == 1:
                        npInd += 1

                    candidates.append((noun, npInd))
                    print("...> Candidate: {0} npInd = {1}".format(noun, npInd))

            if len(candidates) > 0:

                pickedWord, pickedInd = candidates[0]
                for word, npInd in candidates:
                    if npInd > pickedInd:
                        pickedInd = npInd
                        pickedWord = word

                print(".>>> Picked: {0}".format(pickedWord))
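The scoring loop above is a knowledge-poor, indicator-style ranking: each candidate noun among the last 30 noun phrases starts from its gender-match score and is adjusted by the indicators marked in the comments, and the highest-scoring candidate is picked. A small worked example of the arithmetic (the candidate's properties are hypothetical, chosen only to exercise each branch):

# hypothetical candidate: same gender and number as the pronoun, definite,
# inside a prepositional chunk, not sentence-initial, mentioned twice in the
# text, and one sentence before the pronoun
npInd = 2        # genderMatch(word, noun) == 2 (exact gender match)
npInd -= 1       # isPrepositional(chunk) is True: prepositional-phrase penalty
npInd += 1       # countInText(noun, text) == 2: lexical reiteration bonus
npInd += 1       # referential distance of one sentence
print(npInd)     # 3 - the candidate with the highest npInd wins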
55  NLP/src/fileparser.py  Normal file
@@ -0,0 +1,55 @@
'''
Created on May 22, 2016

@author: tibi
'''

from xml.dom import minidom
from model.Word import Word

def parse(filename):

    words = []
    chunks = {}

    sentence_i = 0

    # get the root "segs" element
    dom = minidom.parse(filename)
    alltext = dom.getElementsByTagName("segs")

    # iterate paragraphs
    for paragraph in alltext[0].getElementsByTagName("seg"):

        # iterate sentences
        for sentence in paragraph.getElementsByTagName("s"):

            # increment sentence index
            sentence_i += 1
            word_i = 0

            # iterate words
            for word in sentence.getElementsByTagName("w"):

                # increment word index
                word_i += 1

                # obtain word info
                wordText = word.firstChild.data
                lemma = word.getAttribute("lemma")
                ana = word.getAttribute("ana")
                chunk = word.getAttribute("chunk")

                # create word
                #w = Word(wordText, lemma, ana, chunk, sentence_i, word_i)
                #words.append(w)

                for c in chunk.split(","):
                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
                    words.append(w)
                    if chunks.get((sentence_i, c)) == None:
                        chunks[(sentence_i, c)] = [ w ]
                    else:
                        chunks[(sentence_i, c)].append(w)

    return (words, chunks)
26  NLP/src/main.py  Normal file
@@ -0,0 +1,26 @@
'''
Created on May 22, 2016

@author: tibi
'''
import fileparser
from anaphora import anaphora

if __name__ == '__main__':

    words, chunks = fileparser.parse("../data/text3_processed.xml")

    print("Words:")
    for word in words:
        print("[{0} {1}] {2}".format(word.sentenceIndex, word.wordIndex, word))
    print("")

    print("Chunks:")
    for key, value in chunks.items():
        print(key, ":")
        for word in value:
            print(" - ", word)
    print("")

    print("Anaphora resolution:")
    anaphora(words, chunks)
88  NLP/src/model/Word.py  Normal file
@@ -0,0 +1,88 @@
'''
Created on May 22, 2016

@author: tibi
'''

class Word:

    text = ""
    lemma = ""
    ana = ""
    chunk = ""

    sentenceIndex = 0
    wordIndex = 0

    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
        self.text = text
        self.lemma = lemma
        self.ana = ana
        self.chunk = chunk
        self.sentenceIndex = sentenceIndex
        self.wordIndex = wordIndex

    def __str__(self):
        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)

    def isNoun(self):
        return self.ana[0] == "N"

    def nounIsCommon(self):
        return self.isNoun() and self.ana[1] == "c"

    def nounIsProper(self):
        return self.isNoun() and self.ana[1] == "p"

    def nounGetCase(self):

        if self.isNoun():
            return self.ana[4]

        return None

    # Is the noun articulated (definite)?
    def nounIsDefinite(self):
        if self.isNoun():
            if (self.nounIsProper()):
                return True

            if len(self.ana) > 5:
                return self.ana[5]

            return "n"

    def pronounGetPerson(self):
        if self.isPronoun():
            return self.ana[2]

    def getGender(self):
        if self.isNoun():
            if (len(self.ana) >= 3):
                return self.ana[2]
            return 'n'

        if self.isPronoun():
            return self.ana[3]

        return None

    def getNumber(self):
        if self.isNoun():
            if self.nounIsProper():
                return 's'
            else:
                return self.ana[3]
        if self.isPronoun():
            return self.ana[4]

        return None

    def isPronoun(self):
        return self.ana[0] == "P"

    def isVerb(self):
        return self.ana[0] == "V"

    def isPreposition(self):
        return self.ana[0] == "S" and self.ana[1] == "p"
0  NLP/src/model/__init__.py  Normal file
1  data/index.csv  Normal file
@@ -0,0 +1 @@
books/Moara cu noroc - Ioan Slavici.epub;Moara cu noroc;1880;Ioan Slavici;1848;Transilvania
BIN  data/results.db  Normal file
Binary file not shown.
BIN  data/texts.db  Normal file
Binary file not shown.
BIN  data/texts.db.bak  Normal file
Binary file not shown.
BIN  data/texts2.db  Normal file
Binary file not shown.
20  src/logger.py  Normal file
@@ -0,0 +1,20 @@
import time
import logging

def init_logger(level):
    # Log filename
    tm = time.strftime('%Y-%m-%d_%H-%M-%S')
    logFile = "logs/log_{0}.log".format(tm)

    # Set up file logger
    logging.basicConfig(filename=logFile,
                        level=logging.DEBUG,
                        format='%(asctime)s %(name)s %(levelname)s %(message)s',
                        datefmt='%m-%d %H:%M')

    # Set up console logger
    formatter = logging.Formatter('[%(name)s] %(levelname)s: %(message)s')
    console = logging.StreamHandler()
    console.setLevel(level)
    console.setFormatter(formatter)
    logging.getLogger().addHandler(console)
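A minimal usage sketch for this module. Note that logging.basicConfig opens a file under logs/, a directory the .gitignore keeps out of version control, so creating it beforehand is an assumption added here rather than something the commit does:

import logging
import os
import logger

os.makedirs("logs", exist_ok=True)   # assumption: ensure the log directory exists before the file handler opens it
logger.init_logger(logging.INFO)     # console shows INFO and above; the log file always receives DEBUG
logging.getLogger("storage").info("logger initialised")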
40  src/main.py  Normal file
@@ -0,0 +1,40 @@
import logging
import time
# own
import logger
import storage.data
import storage.results
import textprocessor.letterfreq
import ttl.ttlparser
import ttl.ttlservice

def init():
    logger.init_logger(logging.WARNING)
    storage.data.initializeFragmentDatabase("data/texts.db")
    storage.results.initializeResultsDatabase("data/results.db", True)

def processTexts():
    count = storage.data.getTextCount()
    current = 0
    for item in storage.data.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # obtain ttl analysis
        # unfeasible - it takes 5-10 minutes for a single text
        # ttlResult = ttl.ttlservice.executeTtl(itemtext)
        # (words, chunks) = ttl.ttlparser.parseText(ttlResult)
        # storage.results.storeTtlAnalysis(itemid, words)

        # perform analysis
        letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext)
        storage.results.storeFrequencies(itemid, letterFreq)

    print("Finished!")

init()
processTexts()
32  src/model.py  Normal file
@@ -0,0 +1,32 @@
# Defines a fragment author
class Author:
    def __init__(self, name = "", birthYear = "", location = "Romania"):
        self.name = name
        self.yearOfBirth = birthYear
        self.location = location

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def dump(self):
        return "[Author name={0} yearOfBirth={1} location={2}]".format(self.name, self.yearOfBirth, self.location)

# Defines a text fragment
class Fragment:
    def __init__(self, title = "", text = "", author = Author(), year = 1999):
        self.title = title
        self.text = text
        self.author = author
        self.year = year

    def __str__(self):
        return self.title

    def __repr__(self):
        return self.title

    def dump(self):
        return "[Fragment title={0} author={1} year={2} text={3}]".format(self.title, self.author.dump(), self.year, self.text)
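A short sketch of how these model classes line up with the single sample row in data/index.csv; reading that row's fields as path;title;year;author;author birth year;region is an assumption, and the fragment text is elided:

from model import Author, Fragment

slavici = Author(name="Ioan Slavici", birthYear=1848, location="Transilvania")
fragment = Fragment(title="Moara cu noroc", text="...", author=slavici, year=1880)
print(fragment.dump())
# [Fragment title=Moara cu noroc author=[Author name=Ioan Slavici yearOfBirth=1848 location=Transilvania] year=1880 text=...]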
87  src/model/Word.py  Normal file
@@ -0,0 +1,87 @@

# Defines a processed word
class Word:

    text = ""
    lemma = ""
    ana = ""
    chunk = ""

    sentenceIndex = 0
    wordIndex = 0

    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
        self.text = text
        self.lemma = lemma
        self.ana = ana
        self.chunk = chunk
        self.sentenceIndex = sentenceIndex
        self.wordIndex = wordIndex

    def __str__(self):
        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)

    def __repr__(self):
        return str(self)

    def isNoun(self):
        return self.ana[0] == "N"

    def nounIsCommon(self):
        return self.isNoun() and self.ana[1] == "c"

    def nounIsProper(self):
        return self.isNoun() and self.ana[1] == "p"

    def nounGetCase(self):

        if self.isNoun():
            return self.ana[4]

        return None

    # Is the noun articulated (definite)?
    def nounIsDefinite(self):
        if self.isNoun():
            if (self.nounIsProper()):
                return True

            if len(self.ana) > 5:
                return self.ana[5]

            return "n"

    def pronounGetPerson(self):
        if self.isPronoun():
            return self.ana[2]

    def getGender(self):
        if self.isNoun():
            if (len(self.ana) >= 3):
                return self.ana[2]
            return 'n'

        if self.isPronoun():
            return self.ana[3]

        return None

    def getNumber(self):
        if self.isNoun():
            if self.nounIsProper():
                return 's'
            else:
                return self.ana[3]
        if self.isPronoun():
            return self.ana[4]

        return None

    def isPronoun(self):
        return self.ana[0] == "P"

    def isVerb(self):
        return self.ana[0] == "V"

    def isPreposition(self):
        return self.ana[0] == "S" and self.ana[1] == "p"
0  src/model/__init__.py  Normal file
0  src/storage/__init__.py  Normal file
80  src/storage/data.py  Normal file
@@ -0,0 +1,80 @@
import logging
import os
from model import *
import sqlite3

log = logging.getLogger("storage")

DB_FRAGMENTS = ""

# Commands

# birth location - general area, not exact location (i.e. Transylvania)
# birth origin - rural or urban
# studies - masters, bachelors, high school, middle school, primary school
# occupation - comma separated if there are multiple
# studiesAbroad - foreign cities where author studied (comma separated)
COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
    name TEXT PRIMARY KEY,
    birthYear INTEGER,
    birthLocation TEXT,
    birthOrigin TEXT,
    studies TEXT,
    occupations TEXT,
    studiesAbroad TEXT
)"""

# genre - short story (nuvela), novel (roman), poem etc
# movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
# tags - other relevant information (i.e. psychological)
COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
    id INTEGER PRIMARY KEY,
    title TEXT,
    year INTEGER,
    author TEXT REFERENCES Authors(name),
    genre TEXT,
    movement TEXT,
    tags TEXT
)"""

# contains the actual text
COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
    id INTEGER REFERENCES Fragments(id),
    content TEXT
)"""

# Initialize databases
def initializeFragmentDatabase(dbFile):
    global DB_FRAGMENTS
    DB_FRAGMENTS = dbFile

    if not os.path.exists(dbFile):
        log.info("Text database %s not found. Will create database.", dbFile)
        con = sqlite3.connect(dbFile)
        c = con.cursor()
        c.execute(COMMAND_CREATE_AUTHORS)
        c.execute(COMMAND_CREATE_FRAGMENTS)
        c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT)
        con.commit()
        con.close()
        log.info("Database created!")

def getTextCount():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT COUNT(*) FROM Fragments")
    item = c.fetchone()
    c.close()
    con.close()
    return item[0]

def getAllTexts():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT id, content FROM FragmentsContent")

    items = c.fetchall()

    c.close()
    con.close()
    return items
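A short sketch of how a fragment could be added by hand under this schema, so that getTextCount and getAllTexts pick it up; the helper name addFragment and the sample values are assumptions for illustration, not part of the module:

import sqlite3

def addFragment(dbFile, fragment_id, title, year, author, content):
    # assumption: the referenced author row already exists in Authors
    con = sqlite3.connect(dbFile)
    c = con.cursor()
    c.execute("INSERT INTO Fragments(id, title, year, author) VALUES (?, ?, ?, ?)",
              (fragment_id, title, year, author))
    c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)",
              (fragment_id, content))
    con.commit()
    con.close()

addFragment("data/texts.db", 1, "Moara cu noroc", 1880, "Ioan Slavici", "...")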
84  src/storage/results.py  Normal file
@@ -0,0 +1,84 @@
import logging
import os
from model.Word import *
import sqlite3

log = logging.getLogger("storage")

DB_RESULTS = ""

COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies (
    idtext INTEGER,
    lettergroup TEXT,
    category TEXT,
    frequency REAL
)"""

COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords (
    idtext INTEGER,
    wordIndex INTEGER,
    sentenceIndex INTEGER,
    word TEXT,
    lemma TEXT,
    analysis TEXT,
    chunk TEXT
)"""

# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
#     idtext INTEGER,
#     wordlength INTEGER,
#     frequency REAL
# )"""

def initializeResultsDatabase(dbFile, cleanupOldData):
    global DB_RESULTS
    DB_RESULTS = dbFile

    # cleanup old data
    if cleanupOldData:
        con = sqlite3.connect(DB_RESULTS)
        c = con.cursor()

        try:
            c.execute("DROP TABLE LetterFrequencies")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_LETTER_FREQUENCIES)

        try:
            c.execute("DROP TABLE TextWords")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_TEXT_WORDS)

        con.commit()
        c.close()
        con.close()


def storeFrequencies(idtext, freq):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # add data
    chr = ['p', 'l1', 'l2', 'l3']
    for i in range(4):
        for let, fr in freq[i]:
            c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, chr[i], fr))

    con.commit()
    c.close()
    con.close()

def storeTtlAnalysis(idtext, words):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # store words
    for word in words:
        c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk))

    # finish
    con.commit()
    c.close()
    con.close()
14  src/test.py  Normal file
@@ -0,0 +1,14 @@
# coding: utf-8
from ttl import ttlservice
from ttl import ttlparser
import nltk

import storage

data = storage.parseIndex("data")
print(data)

#textXml = ttlservice.executeTtl(u"Numele meu este Tibi și îmi place să cânt la chitară bass. Ce faci?")
#words, chunks = ttlparser.parseText(textXml)
#print ("Words: ", words)
#print ("Chunks: ", chunks)
0  src/textprocessor/__init__.py  Normal file
38  src/textprocessor/letterfreq.py  Normal file
@@ -0,0 +1,38 @@
import operator
import storage

def letterFrequencies(text):
    letterfreq = [{}, {}, {}, {}]
    lettersum = [0, 0, 0, 0]

    n = len(text)
    for i in range(n):

        # compute substring frequency
        # l = substring length
        for l in range(1, 4):
            sub = text[i : i + l].lower()
            if len(sub) == l and sub.isalnum():
                lettersum[l] += 1
                if not sub in letterfreq[l]:
                    letterfreq[l][sub] = 1
                else:
                    letterfreq[l][sub] += 1

        # compute punctuation frequency
        chr = text[i]
        if not chr.isalnum() and not chr.isspace() and chr.isprintable():
            lettersum[0] += 1
            if not chr in letterfreq[0]:
                letterfreq[0][chr] = 1
            else:
                letterfreq[0][chr] += 1

    # Almost done. Sort and remove irrelevant items (with low frequency), and normalize data
    for i in range(4):
        freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True)
        freqFiltered = freqSorted[0:50]
        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqFiltered]
        letterfreq[i] = freqNormalized

    return letterfreq
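letterFrequencies returns four parallel lists: index 0 holds punctuation frequencies and indices 1 to 3 hold 1-, 2- and 3-character alphanumeric substring frequencies, each as up to 50 (symbol, relative frequency) pairs sorted most frequent first. A minimal illustration, assuming it is run from the src directory and using an arbitrary input string:

from textprocessor.letterfreq import letterFrequencies

freq = letterFrequencies("Maria are mere.")
punctuation, unigrams, bigrams, trigrams = freq
print(unigrams[:3])   # e.g. [('a', 0.25), ('r', 0.25), ('e', 0.25)] - ties may be ordered differently
print(punctuation)    # [('.', 1.0)] - the full stop is the only punctuation mark here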
2  src/textprocessor/wordanalysis.py  Normal file
@@ -0,0 +1,2 @@
def analyzeWords(text):
    pass
128  src/tools/wikisource_downloader.py  Normal file
@@ -0,0 +1,128 @@
import urllib.request
import urllib.error
from pyquery import PyQuery
import sqlite3
import re

BASE_URL = "https://ro.wikisource.org"
LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

def getAuthorList():
    authors = []
    for letter in LETTERS:
        print("Processing link page for letter", letter)
        # Read index page
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        data = urllib.request.urlopen(url).read()
        q = PyQuery(data)
        for item in q("div.mw-category-generated").find("a"):
            if (item.text.startswith("Autor:")):
                authorname = item.text[6:]
                authorlink = item.attrib['href']
                authors.append((authorname, authorlink))
    return list(set(authors))

def getAuthorWikiLink(query):
    wikilink = None
    body = query("div#mw-content-text")
    table = body.find("table")
    for link in table.find("a"):
        if "ro.wikipedia.org" in link.attrib['href']:
            wikilink = link.attrib['href']
    return wikilink

def getAuthorLinksList(authorname, query):
    links = []
    body = query("div#mw-content-text")
    for link in body.find("a"):
        address = link.attrib['href']
        ok = True
        if "http" in address:
            ok = False
        if "redlink" in address:
            ok = False
        if "Fi%C8%99ier:" in address:
            ok = False
        if "index.php" in address:
            ok = False
        if address.startswith("#"):
            ok = False
        if "Autor:" in address:
            ok = False
        if ok:
            links.append(link.attrib['href'])
    return links

def getAuthorBasicInfo(authorname, authorlink):
    info = {}
    data = urllib.request.urlopen(BASE_URL + authorlink).read()
    q = PyQuery(data)

    info["wiki"] = getAuthorWikiLink(q)
    info["links"] = getAuthorLinksList(authorname, q)

    return info

# def getAuthorWikiInfo(authorinfo):

#     # Nothing can be learned without wiki page
#     if authorinfo["wiki"] is None:
#         return authorinfo

#     try:
#         data = urllib.request.urlopen(authorinfo["wiki"]).read()
#         q = PyQuery(data)

#         # Find the birth date
#         body = q("#mw-content-text").text()
#         result = re.compile(u"Născut\s+([\w\s]+)").match(body)
#         if not result is None:
#             authorinfo["birthyear"] = result.group(0)

#     except urllib.error.HTTPError:
#         pass

#     return authorinfo

def getText(url):
    data = urllib.request.urlopen(BASE_URL + url).read()
    q = PyQuery(data)

    texttitle = q("h1").text()

    body = q("#mw-content-text")
    body.find("table").remove()

    textcontent = body.text()
    return (texttitle, textcontent)

def addAuthorToDb(authorinfo):
    # note: incomplete stub, not called anywhere
    con = sqlite3.connect("data/texts.db")
    c = con.cursor()
    c.execute("INSERT INTO Authors")

def getAllTexts():

    con = sqlite3.connect("data/texts.db")
    c = con.cursor()
    #c.execute("ALTER TABLE Authors ADD COLUMN wiki TEXT")
    id = 1

    authors = getAuthorList()
    for authorname, authorlink in authors:
        print("Processing author", authorname)
        authorinfo = getAuthorBasicInfo(authorname, authorlink)
        c.execute("INSERT INTO Authors(name,wiki) VALUES(?, ?)", (authorname, authorinfo["wiki"]))

        # authorinfo = getAuthorWikiInfo(authorinfo)
        for text in authorinfo["links"]:
            try:
                title, content = getText(text)
                c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)", (id, title, authorname))
                c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)", (id, content))
                id = id + 1
            except urllib.error.HTTPError:
                continue

    con.commit()

getAllTexts()
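As committed, the script starts the full crawl at import time because getAllTexts() is called at module level. A small guard like the sketch below (an editorial suggestion, not part of the commit) would keep the download from running when the module is merely imported:

if __name__ == "__main__":
    getAllTexts()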
0  src/ttl/__init__.py  Normal file
62  src/ttl/ttlparser.py  Normal file
@@ -0,0 +1,62 @@
'''
Created on May 22, 2016

@author: tibi
'''

from xml.dom import minidom
from xml.parsers.expat import ExpatError
from model.Word import Word

def parseText(xmlText):

    words = []
    chunks = {}

    sentence_i = 0

    # get the root "segs" element
    try:
        dom = minidom.parseString(xmlText)
    except ExpatError as e:
        print("Error in text:", xmlText)
        print(e)
        exit(-1)

    alltext = dom.getElementsByTagName("segs")

    # iterate paragraphs
    for paragraph in alltext[0].getElementsByTagName("seg"):

        # iterate sentences
        for sentence in paragraph.getElementsByTagName("s"):

            # increment sentence index
            sentence_i += 1
            word_i = 0

            # iterate words
            for word in sentence.getElementsByTagName("w"):

                # increment word index
                word_i += 1

                # obtain word info
                wordText = word.firstChild.data
                lemma = word.getAttribute("lemma")
                ana = word.getAttribute("ana")
                chunk = word.getAttribute("chunk")

                # create word
                #w = Word(wordText, lemma, ana, chunk, sentence_i, word_i)
                #words.append(w)

                for c in chunk.split(","):
                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
                    words.append(w)
                    if chunks.get((sentence_i, c)) == None:
                        chunks[(sentence_i, c)] = [ w ]
                    else:
                        chunks[(sentence_i, c)].append(w)

    return (words, chunks)
34  src/ttl/ttlservice.py  Normal file
@@ -0,0 +1,34 @@
# coding: utf-8
import zeep

def executeTtl(text):
    # Preprocess the text
    text = text.replace(u'ĭ', 'i')
    text = text.replace(u'ŭ', 'u')
    text = text.replace(u'à', 'a')

    client = zeep.Client("http://ws.racai.ro/ttlws.wsdl")
    textSgml = client.service.UTF8toSGML(text)
    result = client.service.XCES("ro", "id", textSgml)

    # Cleanup result - generate valid xml
    result = result.replace('’', '`')
    result = result.replace('ă', u'ă')
    result = result.replace('à', u'à')
    result = result.replace('â', u'â')
    result = result.replace('î', u'î')
    result = result.replace('ş', u'ș')
    result = result.replace('ţ', u'ț')
    result = result.replace('ŭ', u'u')
    result = result.replace('Ă', u'Ă')
    result = result.replace('À', u'À')
    result = result.replace('Â', u'Â')
    result = result.replace('Î', u'Î')
    result = result.replace('Ş', u'Ș')
    result = result.replace('Ţ', u'Ț')
    result = result.replace('Ŭ', u'U')

    xmlResult = "<?xml version=\"1.0\" encoding=\"utf-8\" ?><segs>"
    xmlResult += result
    xmlResult += "</segs>"
    return xmlResult
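The commented-out path in src/main.py shows how this service was meant to be chained with ttlparser and the results store; a minimal end-to-end sketch of that pipeline follows, with the sample sentence borrowed from src/test.py and an arbitrary text id. Per the comment in src/main.py, a single call to the web service can take several minutes, and passing True to initializeResultsDatabase drops and recreates the result tables:

from ttl import ttlservice, ttlparser
import storage.results

storage.results.initializeResultsDatabase("data/results.db", True)   # creates LetterFrequencies and TextWords
xml = ttlservice.executeTtl(u"Numele meu este Tibi și îmi place să cânt la chitară bass.")
words, chunks = ttlparser.parseText(xml)
storage.results.storeTtlAnalysis(1, words)   # 1 is an arbitrary text id
print(len(words), "words,", len(chunks), "chunks")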