Initial commit
Commit 6badfbd103
107 .gitignore (vendored, new file)
@@ -0,0 +1,107 @@
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json

logs/*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
220 .vscode/launch.json (vendored, new file)
@@ -0,0 +1,220 @@
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Run without debugging",
            "type": "python",
            "request": "launch",
            "stopOnEntry": false,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Python",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "PySpark",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "osx": {
                "pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
            },
            "windows": {
                "pythonPath": "${env:SPARK_HOME}/bin/spark-submit.cmd"
            },
            "linux": {
                "pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
            },
            "program": "${file}",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Python Module",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "module": "module.name",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Integrated Terminal/Console",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "cwd": "",
            "console": "integratedTerminal",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit"
            ]
        },
        {
            "name": "External Terminal/Console",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${file}",
            "cwd": "",
            "console": "externalTerminal",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit"
            ]
        },
        {
            "name": "Django",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${workspaceRoot}/manage.py",
            "cwd": "${workspaceRoot}",
            "args": [
                "runserver",
                "--noreload"
            ],
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput",
                "DjangoDebugging"
            ]
        },
        {
            "name": "Flask",
            "type": "python",
            "request": "launch",
            "stopOnEntry": false,
            "pythonPath": "${config:python.pythonPath}",
            "program": "fully qualified path fo 'flask' executable. Generally located along with python interpreter",
            "cwd": "${workspaceRoot}",
            "env": {
                "FLASK_APP": "${workspaceRoot}/quickstart/app.py"
            },
            "args": [
                "run",
                "--no-debugger",
                "--no-reload"
            ],
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Flask (old)",
            "type": "python",
            "request": "launch",
            "stopOnEntry": false,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${workspaceRoot}/run.py",
            "cwd": "${workspaceRoot}",
            "args": [],
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Pyramid",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "cwd": "${workspaceRoot}",
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "args": [
                "${workspaceRoot}/development.ini"
            ],
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput",
                "Pyramid"
            ]
        },
        {
            "name": "Watson",
            "type": "python",
            "request": "launch",
            "stopOnEntry": true,
            "pythonPath": "${config:python.pythonPath}",
            "program": "${workspaceRoot}/console.py",
            "cwd": "${workspaceRoot}",
            "args": [
                "dev",
                "runserver",
                "--noreload=True"
            ],
            "env": {},
            "envFile": "${workspaceRoot}/.env",
            "debugOptions": [
                "WaitOnAbnormalExit",
                "WaitOnNormalExit",
                "RedirectOutput"
            ]
        },
        {
            "name": "Attach (Remote Debug)",
            "type": "python",
            "request": "attach",
            "localRoot": "${workspaceRoot}",
            "remoteRoot": "${workspaceRoot}",
            "port": 3000,
            "secret": "my_secret",
            "host": "localhost"
        }
    ]
}
3 .vscode/settings.json (vendored, new file)
@@ -0,0 +1,3 @@
// Place your settings in this file to overwrite default and user settings.
{
}
17 NLP/.project (new file)
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>NLP</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.python.pydev.PyDevBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.python.pydev.pythonNature</nature>
    </natures>
</projectDescription>
8 NLP/.pydevproject (new file)
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?><pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/${PROJECT_DIR_NAME}/src</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 3.0</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python 3.5</pydev_property>
</pydev_project>
1 NLP/data/text1.txt (new file)
@@ -0,0 +1 @@
Maria are mere. Ea mai are șapte pere. Acestea sunt foarte delicioase.
6 NLP/data/text1_processed.xml (new file)
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Maria" ana="Np" chunk="Np#1">Maria</w><w lemma="avea" ana="Vmip3s" chunk="Vp#1">are</w><w lemma="măr" ana="Ncfp-n" chunk="Np#2">mere</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="el" ana="Pp3fsr--------s">Ea</w><w lemma="mai" ana="Rp" chunk="Ap#1">mai</w><w lemma="avea" ana="Vmip3s" chunk="Vp#1">are</w><w lemma="șapte" ana="Mc-p-l" chunk="Np#1">șapte</w><w lemma="pară" ana="Ncfp-n" chunk="Np#1">pere</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.3"><w lemma="acesta" ana="Pd3fpr">Acestea</w><w lemma="fi" ana="Vmip3p" chunk="Vp#1">sunt</w><w lemma="foarte" ana="Rp" chunk="Ap#1,Vp#1">foarte</w><w lemma="delicios" ana="Afpfp-n" chunk="Ap#1">delicioase</w><c>.</c></s></seg>
</segs>
1 NLP/data/text2.txt (new file)
@@ -0,0 +1 @@
Sabeer Bhatia a ajuns la Aeroportul Internațional din Los Angeles la ora 18 în data de 23 septembrie 1998. Zborul său din Bangalore a durat 22 ore, și el era înfometat.
5 NLP/data/text2_processed.xml (new file)
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Sabeer" ana="Np" chunk="Np#1">Sabeer</w><w lemma="Bhatia" ana="Np" chunk="Np#1">Bhatia</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="ajunge" ana="Vmp--sm" chunk="Vp#1">ajuns</w><w lemma="la" ana="Spsa" chunk="Pp#1">la</w><w lemma="aeroport" ana="Ncmsry" chunk="Pp#1,Np#2">Aeroportul</w><w lemma="internațional" ana="Afpms-n" chunk="Pp#1,Np#2,Ap#1">Internațional</w><w lemma="din" ana="Spsa" chunk="Pp#2">din</w><w lemma="Los" ana="Np" chunk="Pp#2,Np#3">Los</w><w lemma="Angeles" ana="Np" chunk="Pp#2,Np#3">Angeles</w><w lemma="la" ana="Spsa" chunk="Pp#3">la</w><w lemma="oră" ana="Ncfsry" chunk="Pp#3,Np#4">ora</w><w lemma="18" ana="Mc" chunk="Pp#3,Np#4">18</w><w lemma="în" ana="Spsa" chunk="Pp#4">în</w><w lemma="dată" ana="Ncfsry" chunk="Pp#4,Np#5">data</w><w lemma="de" ana="Spsa" chunk="Pp#5">de</w><w lemma="23" ana="Mc" chunk="Pp#5,Np#6">23</w><w lemma="septembrie" ana="Ncms-n" chunk="Pp#5,Np#6">septembrie</w><w lemma="1998" ana="Mc" chunk="Pp#5,Np#6">1998</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="zbor" ana="Ncmsry" chunk="Np#1">Zborul</w><w lemma="său" ana="Ds3ms-s" chunk="Np#1">său</w><w lemma="din" ana="Spsa" chunk="Pp#1">din</w><w lemma="Bangalore" ana="Np" chunk="Pp#1,Np#2">Bangalore</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="dura" ana="Vmp--sm" chunk="Vp#1">durat</w><w lemma="22" ana="Mc" chunk="Np#3">22</w><w lemma="oră" ana="Ncfp-n" chunk="Np#3">ore</w><c>,</c><w lemma="și" ana="Crssp">și</w><w lemma="el" ana="Pp3msr--------s" chunk="Vp#2">el</w><w lemma="fi" ana="Vaii3s" chunk="Vp#2">era</w><w lemma="înfometa" ana="Vmp--sm" chunk="Vp#2">înfometat</w><c>.</c></s></seg>
</segs>
1 NLP/data/text3.txt (new file)
@@ -0,0 +1 @@
Sophia Loren spune că ea va fi întotdeauna mulțumitoare față de Bono. Actrița a dezvăluit că cântărețul trupei U2 a ajutat-o să se liniștească atunci când ea s-a speriat de o furtună în timp ce zburau cu avionul.
5 NLP/data/text3_processed.xml (new file)
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8" ?>
<segs>
<seg lang="ro"><s id="id_temp_aiurea.1"><w lemma="Sophia" ana="Np" chunk="Np#1">Sophia</w><w lemma="Loren" ana="Np" chunk="Np#1">Loren</w><w lemma="spune" ana="Vmnp" chunk="Vp#1">spune</w><w lemma="că" ana="Csssp">că</w><w lemma="el" ana="Pp3fsr--------s" chunk="Vp#2">ea</w><w lemma="vrea" ana="Va--3s" chunk="Vp#2">va</w><w lemma="fi" ana="Vmnp" chunk="Vp#2">fi</w><w lemma="întotdeauna" ana="Rgp" chunk="Vp#2,Ap#1">întotdeauna</w><w lemma="mulțumitor" ana="Afpf--n" chunk="Ap#1">mulțumitoare</w><w lemma="față_de" ana="Spca" chunk="Pp#1">față_de</w><w lemma="bonă" ana="Ncfsvy" chunk="Pp#1,Np#2">Bono</w><c>.</c></s></seg>
<seg lang="ro"><s id="id_temp_aiurea.2"><w lemma="actriță" ana="Ncfsry" chunk="Np#1">Actrița</w><w lemma="avea" ana="Va--3s" chunk="Vp#1">a</w><w lemma="dezvălui" ana="Vmp--sm" chunk="Vp#1">dezvăluit</w><w lemma="că" ana="Csssp">că</w><w lemma="cântăreț" ana="Ncmsry" chunk="Np#2">cântărețul</w><w lemma="trupă" ana="Ncfsoy" chunk="Np#2">trupei</w><w lemma="U2" ana="Np" chunk="Np#2">U2</w><w lemma="avea" ana="Va--3s" chunk="Vp#2">a</w><w lemma="ajuta" ana="Vmp--sm" chunk="Vp#2">ajutat</w><w lemma="el" ana="Pp3fsa--y-----w">-o</w><w lemma="să" ana="Qs" chunk="Vp#3">să</w><w lemma="sine" ana="Px3--a--------w" chunk="Vp#3">se</w><w lemma="liniști" ana="Vmsp3" chunk="Vp#3">liniștească</w><w lemma="atunci_când" ana="Rw" chunk="Vp#3,Ap#1">atunci_când</w><w lemma="el" ana="Pp3fsr--------s">ea</w><w lemma="sine" ana="Px3--a--y-----w" chunk="Vp#4">s-</w><w lemma="avea" ana="Va--3s" chunk="Vp#4">a</w><w lemma="speria" ana="Vmp--sm" chunk="Vp#4">speriat</w><w lemma="de" ana="Spsa" chunk="Pp#1">de</w><w lemma="un" ana="Tifsr" chunk="Pp#1,Np#3">o</w><w lemma="furtună" ana="Ncfsrn" chunk="Pp#1,Np#3">furtună</w><w lemma="în_timp_ce" ana="Cscsp">în_timp_ce</w><w lemma="zbura" ana="Vmii3p" chunk="Vp#5">zburau</w><w lemma="cu" ana="Spsa" chunk="Pp#2">cu</w><w lemma="avion" ana="Ncmsry" chunk="Pp#2,Np#4">avionul</w><c>.</c></s></seg>
</segs>
121 NLP/src/anaphora.py (new file)
@@ -0,0 +1,121 @@
'''
Created on May 22, 2016

@author: tibi
'''
from model import Word


def getGender(word):

    if word.isPronoun() and (word.pronounGetPerson() == '1' or word.pronounGetPerson() == '2'):
        return 'n'

    return word.getGender()


def genderMatch(word1, word2):

    g1 = getGender(word1)
    g2 = getGender(word2)

    if g1 == g2:
        return 2

    if g1 == 'n' or g2 == 'n':
        return 1

    return 0


def isPrepositional(chunk):

    for word in chunk:

        if word.isPreposition():
            return True

    return False


def countInText(noun, text):

    c = 0
    for word in text:
        if word.text == noun.text:
            c += 1

    return c


def anaphora(text, chunks):

    nounPhrases = []

    for word in text:

        if word.isNoun():
            print("[n]", word)
            nounPhrases.append((word, (word.sentenceIndex, word.chunk)))

        else:
            print(word)

        if word.isPronoun():

            candidates = []

            for noun, chunkIndex in nounPhrases[:-30:-1]:

                # If gender and number match
                if genderMatch(word, noun) > 0 and word.getNumber() == noun.getNumber():

                    npInd = genderMatch(word, noun)

                    # definiteness
                    if not noun.nounIsDefinite():
                        npInd -= 1

                    # non-prepositional noun phrase
                    chunk = chunks[chunkIndex]
                    if (isPrepositional(chunk)):
                        npInd -= 1

                    # first in sentence
                    if noun.sentenceIndex == 1:
                        npInd += 1

                    # indicating verbs
                    # todo...

                    # lexical reiteration
                    c = countInText(noun, text)
                    if c == 2:
                        npInd += 1
                    if c > 2:
                        npInd += 2

                    # noun is representing term
                    # how?

                    # identical collocation pattern to the pronoun
                    # ???

                    # immediate reference, resolving 'it'
                    # applicable?

                    # referential distance
                    dist = word.sentenceIndex - noun.sentenceIndex
                    if dist == 0:
                        npInd += 2
                    elif dist == 1:
                        npInd += 1

                    candidates.append((noun, npInd))
                    print("...> Candidate: {0} npInd = {1}".format(noun, npInd))

            if len(candidates) > 0:

                pickedWord, pickedInd = candidates[0]
                for word, npInd in candidates:
                    if npInd > pickedInd:
                        pickedInd = npInd
                        pickedWord = word

                print(".>>> Picked: {0}".format(pickedWord))
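NLP/src/main.py below drives this scoring pass end to end; a minimal sketch of the same call pattern (illustrative only, not an additional committed file):

# Illustrative sketch, not part of the commit: parse a processed TTL/XCES
# file and run the anaphora scoring over the resulting words and chunks.
import fileparser
from anaphora import anaphora

words, chunks = fileparser.parse("../data/text1_processed.xml")
anaphora(words, chunks)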
55 NLP/src/fileparser.py (new file)
@@ -0,0 +1,55 @@
'''
Created on May 22, 2016

@author: tibi
'''

from xml.dom import minidom;
from model.Word import Word


def parse(filename):

    words = []
    chunks = {}

    sentence_i = 0

    # get the root "segs" element
    dom = minidom.parse(filename)
    alltext = dom.getElementsByTagName("segs")

    # iterate paragraphs
    for paragraph in alltext[0].getElementsByTagName("seg"):

        # iterate sentences
        for sentence in paragraph.getElementsByTagName("s"):

            # increment sentence index
            sentence_i += 1
            word_i = 0

            # iterate words
            for word in sentence.getElementsByTagName("w"):

                # increment word index
                word_i += 1

                # obtain word info
                wordText = word.firstChild.data
                lemma = word.getAttribute("lemma")
                ana = word.getAttribute("ana")
                chunk = word.getAttribute("chunk")

                # create word
                #w = Word(wordText, lemma, ana, chunk, sentence_i, word_i)
                #words.append(w)

                for c in chunk.split(","):
                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
                    words.append(w)
                    if chunks.get((sentence_i, c)) == None:
                        chunks[(sentence_i, c)] = [ w ]
                    else:
                        chunks[(sentence_i, c)].append(w)

    return (words, chunks)
26 NLP/src/main.py (new file)
@@ -0,0 +1,26 @@
'''
Created on May 22, 2016

@author: tibi
'''
import fileparser
from anaphora import anaphora

if __name__ == '__main__':

    words, chunks = fileparser.parse("../data/text3_processed.xml")

    print("Words:")
    for word in words:
        print("[{0} {1}] {2}".format(word.sentenceIndex, word.wordIndex, word))
    print("")

    print("Chunks:")
    for key, value in chunks.items():
        print(key, ":")
        for word in value:
            print(" - ", word)
    print("")

    print("Anaphora resolution:")
    anaphora(words, chunks)
88 NLP/src/model/Word.py (new file)
@@ -0,0 +1,88 @@
'''
Created on May 22, 2016

@author: tibi
'''


class Word:

    text = ""
    lemma = ""
    ana = ""
    chunk = ""

    sentenceIndex = 0
    wordIndex = 0

    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
        self.text = text
        self.lemma = lemma
        self.ana = ana
        self.chunk = chunk
        self.sentenceIndex = sentenceIndex
        self.wordIndex = wordIndex

    def __str__(self):
        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)

    def isNoun(self):
        return self.ana[0] == "N"

    def nounIsCommon(self):
        return self.isNoun() and self.ana[1] == "c"

    def nounIsProper(self):
        return self.isNoun() and self.ana[1] == "p"

    def nounGetCase(self):

        if self.isNoun():
            return self.ana[4]

        return None

    'Este articulat?'
    def nounIsDefinite(self):
        if self.isNoun():
            if (self.nounIsProper()):
                return True

            if len(self.ana) > 5:
                return self.ana[5]

        return "n"

    def pronounGetPerson(self):
        if self.isPronoun():
            return self.ana[2]

    def getGender(self):
        if self.isNoun():
            if (len(self.ana) >= 3):
                return self.ana[2]
            return 'n'

        if self.isPronoun():
            return self.ana[3]

        return None

    def getNumber(self):
        if self.isNoun():
            if self.nounIsProper():
                return 's'
            else:
                return self.ana[3]
        if self.isPronoun():
            return self.ana[4]

        return None

    def isPronoun(self):
        return self.ana[0] == "P"

    def isVerb(self):
        return self.ana[0] == "V"

    def isPreposition(self):
        return self.ana[0] == "S" and self.ana[1] == "p"
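The positional indexing above assumes MULTEXT-East style morphosyntactic tags like those in the sample XML files; a minimal sketch (illustrative only, not part of the committed files) of how the accessors decode one tag taken from NLP/data/text2_processed.xml:

# "Ncfsry" = Noun, common, feminine, singular, direct case, definite.
from model.Word import Word

w = Word("ora", "oră", "Ncfsry", "Np#4", 1, 12)
print(w.isNoun())          # True -> ana[0] == "N"
print(w.nounIsCommon())    # True -> ana[1] == "c"
print(w.getGender())       # "f"  -> ana[2]
print(w.getNumber())       # "s"  -> ana[3]
print(w.nounGetCase())     # "r"  -> ana[4]
print(w.nounIsDefinite())  # "y"  -> ana[5]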
0 NLP/src/model/__init__.py (new file, empty)
1 data/index.csv (new file)
@@ -0,0 +1 @@
books/Moara cu noroc - Ioan Slavici.epub;Moara cu noroc;1880;Ioan Slavici;1848;Transilvania
BIN data/results.db (new file, binary file not shown)
BIN data/texts.db (new file, binary file not shown)
BIN data/texts.db.bak (new file, binary file not shown)
BIN data/texts2.db (new file, binary file not shown)
20 src/logger.py (new file)
@@ -0,0 +1,20 @@
import time
import logging

def init_logger(level):
    # Log filename
    tm = time.strftime('%Y-%m-%d_%H-%M-%S')
    logFile = "logs/log_{0}.log".format(tm)

    # Set up file logger
    logging.basicConfig(filename=logFile,
                        level=logging.DEBUG,
                        format='%(asctime)s %(name)s %(levelname)s %(message)s',
                        datefmt='%m-%d %H:%M')

    # Set up console logger
    formatter = logging.Formatter('[%(name)s] %(levelname)s: %(message)s')
    console = logging.StreamHandler()
    console.setLevel(level)
    console.setFormatter(formatter)
    logging.getLogger().addHandler(console)
40 src/main.py (new file)
@@ -0,0 +1,40 @@
import logging
import time
# own
import logger
import storage.data
import storage.results
import textprocessor.letterfreq
import ttl.ttlparser
import ttl.ttlservice

def init():
    logger.init_logger(logging.WARNING)
    storage.data.initializeFragmentDatabase("data/texts.db")
    storage.results.initializeResultsDatabase("data/results.db", True)

def processTexts():
    count = storage.data.getTextCount()
    current = 0
    for item in storage.data.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # obtain ttl analysis
        # unfeasable - it takes 5-10 minutes for a single text
        # ttlResult = ttl.ttlservice.executeTtl(itemtext)
        # (words, chunks) = ttl.ttlparser.parseText(ttlResult)
        # storage.results.storeTtlAnalysis(itemid, words)

        # perform analysis
        letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext)
        storage.results.storeFrequencies(itemid, letterFreq)

    print("Finished!")

init()
processTexts()
32 src/model.py (new file)
@@ -0,0 +1,32 @@
# Defines a fragment author
class Author:
    def __init__(self, name = "", birthYear = "", location = "Romania"):
        self.name = name
        self.yearOfBirth = birthYear
        self.location = location

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def dump(self):
        return "[Author name={0} yearOfBirth={1} location={2}]".format(self.name, self.yearOfBirth, self.location)

# Defines a text fragment
class Fragment:
    def __init__(self, title = "", text = "", author = Author(), year = 1999):
        self.title = title
        self.text = text
        self.author = author
        self.year = year

    def __str__(self):
        return self.title

    def __repr__(self):
        return self.title

    def dump(self):
        return "[Fragment title={0} author={1} year={2} text={3}]".format(self.title, self.author.dump(), self.year, self.text)
87 src/model/Word.py (new file)
@@ -0,0 +1,87 @@

# Defines a processed word
class Word:

    text = ""
    lemma = ""
    ana = ""
    chunk = ""

    sentenceIndex = 0
    wordIndex = 0

    def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
        self.text = text
        self.lemma = lemma
        self.ana = ana
        self.chunk = chunk
        self.sentenceIndex = sentenceIndex
        self.wordIndex = wordIndex

    def __str__(self):
        return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)

    def __repr__(self):
        return str(self)

    def isNoun(self):
        return self.ana[0] == "N"

    def nounIsCommon(self):
        return self.isNoun() and self.ana[1] == "c"

    def nounIsProper(self):
        return self.isNoun() and self.ana[1] == "p"

    def nounGetCase(self):

        if self.isNoun():
            return self.ana[4]

        return None

    'Este articulat?'
    def nounIsDefinite(self):
        if self.isNoun():
            if (self.nounIsProper()):
                return True

            if len(self.ana) > 5:
                return self.ana[5]

        return "n"

    def pronounGetPerson(self):
        if self.isPronoun():
            return self.ana[2]

    def getGender(self):
        if self.isNoun():
            if (len(self.ana) >= 3):
                return self.ana[2]
            return 'n'

        if self.isPronoun():
            return self.ana[3]

        return None

    def getNumber(self):
        if self.isNoun():
            if self.nounIsProper():
                return 's'
            else:
                return self.ana[3]
        if self.isPronoun():
            return self.ana[4]

        return None

    def isPronoun(self):
        return self.ana[0] == "P"

    def isVerb(self):
        return self.ana[0] == "V"

    def isPreposition(self):
        return self.ana[0] == "S" and self.ana[1] == "p"
0 src/model/__init__.py (new file, empty)
0 src/storage/__init__.py (new file, empty)
80 src/storage/data.py (new file)
@@ -0,0 +1,80 @@
import logging
import os
from model import *
import sqlite3

log = logging.getLogger("storage")

DB_FRAGMENTS = ""

# Commands

# birth location - general area, not exact location (i.e. Transylvania)
# birth origin - rural or urban
# studies - masters, bachelors, high school, middle school, primary school
# occupation - comma separated if there are multiple
# studiesAbroad - foreign cities where author studied (comma separated)
COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
    name TEXT PRIMARY KEY,
    birthYear INTEGER,
    birthLocation TEXT,
    birthOrigin TEXT,
    studies TEXT,
    occupations TEXT,
    studiesAbroad TEXT
)"""

# genre - short story (nuvela), novel (roman), poem etc
# movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
# tags - other relevant information (i.e. psychological)
COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
    id INTEGER PRIMARY KEY,
    title TEXT,
    year INTEGER,
    author TEXT REFERENCES Authors(name),
    genre TEXT,
    movement TEXT,
    tags TEXT
)"""

# contains the actual text
COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
    id INTEGER REFERENCES Fragments(id),
    content TEXT
)"""

# Initialize databases
def initializeFragmentDatabase(dbFile):
    global DB_FRAGMENTS
    DB_FRAGMENTS = dbFile

    if not os.path.exists(dbFile):
        log.info("Text database %s not found. Will create database.", dbFile)
        con = sqlite3.connect(dbFile)
        c = con.cursor()
        c.execute(COMMAND_CREATE_AUTHORS)
        c.execute(COMMAND_CREATE_FRAGMENTS)
        c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT)
        con.commit()
        con.close()
        log.info("Database created!")

def getTextCount():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT COUNT(*) FROM Fragments")
    item = c.fetchone()
    c.close()
    con.close()
    return item[0]

def getAllTexts():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT id, content FROM FragmentsContent")

    items = c.fetchall()

    c.close()
    con.close()
    return items
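These tables are populated by src/tools/wikisource_downloader.py further down; a minimal sketch of an insert consistent with the schema (the addFragment helper is hypothetical, not part of the commit):

import sqlite3

def addFragment(dbFile, fragId, title, author, content):
    # Hypothetical helper: store one fragment and its text under the
    # Fragments / FragmentsContent tables created above.
    con = sqlite3.connect(dbFile)
    c = con.cursor()
    c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)",
              (fragId, title, author))
    c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)",
              (fragId, content))
    con.commit()
    con.close()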
84 src/storage/results.py (new file)
@@ -0,0 +1,84 @@
import logging
import os
from model.Word import *
import sqlite3

log = logging.getLogger("storage")

DB_RESULTS = ""

COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies (
    idtext INTEGER,
    lettergroup TEXT,
    category TEXT,
    frequency REAL
)"""

COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords (
    idtext INTEGER,
    wordIndex INTEGER,
    sentenceIndex INTEGER,
    word TEXT,
    lemma TEXT,
    analysis TEXT,
    chunk TEXT
)"""

# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
#     idtext INTEGER,
#     wordlength INTEGER,
#     frequency REAL
# )"""

def initializeResultsDatabase(dbFile, cleanupOldData):
    global DB_RESULTS
    DB_RESULTS = dbFile

    # cleanup old data
    if cleanupOldData:
        con = sqlite3.connect(DB_RESULTS)
        c = con.cursor()

        try:
            c.execute("DROP TABLE LetterFrequencies")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_LETTER_FREQUENCIES)

        try:
            c.execute("DROP TABLE TextWords")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_TEXT_WORDS)

        con.commit()
        c.close()
        con.close()


def storeFrequencies(idtext, freq):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # add data
    chr = ['p', 'l1', 'l2', 'l3']
    for i in range(4):
        for let, fr in freq[i]:
            c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, chr[i], fr))

    con.commit()
    c.close()
    con.close()

def storeTtlAnalysis(idtext, words):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # store words
    for word in words:
        c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk))

    # finish
    con.commit()
    c.close()
    con.close()
14 src/test.py (new file)
@@ -0,0 +1,14 @@
# coding: utf-8
from ttl import ttlservice
from ttl import ttlparser
import nltk

import storage

data = storage.parseIndex("data")
print(data)

#textXml = ttlservice.executeTtl(u"Numele meu este Tibi și îmi place să cânt la chitară bass. Ce faci?")
#words, chunks = ttlparser.parseText(textXml)
#print ("Words: ", words)
#print ("Chunks: ", chunks)
0 src/textprocessor/__init__.py (new file, empty)
38 src/textprocessor/letterfreq.py (new file)
@@ -0,0 +1,38 @@
import operator
import storage

def letterFrequencies(text):
    letterfreq = [{}, {}, {}, {}]
    lettersum = [0, 0, 0, 0]

    n = len(text)
    for i in range(n):

        # compute substring frequency
        # l = substring length
        for l in range(1, 4):
            sub = text[i : i + l].lower()
            if len(sub) == l and sub.isalnum():
                lettersum[l] += 1
                if not sub in letterfreq[l]:
                    letterfreq[l][sub] = 1
                else:
                    letterfreq[l][sub] += 1

        # compute punctuation frequency
        chr = text[i]
        if not chr.isalnum() and not chr.isspace() and chr.isprintable():
            lettersum[0] += 1
            if not chr in letterfreq[0]:
                letterfreq[0][chr] = 1
            else:
                letterfreq[0][chr] += 1

    # Almost done. Sort and remove irrelevant items (with low frequency), and normalize data
    for i in range(4):
        freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True)
        freqFiltered = freqSorted[0:50]
        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqFiltered]
        letterfreq[i] = freqNormalized

    return letterfreq
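The return value is a list of four lists of (symbol, relative frequency) pairs: index 0 holds punctuation, indices 1 to 3 hold 1- to 3-character alphanumeric substrings, each sorted by frequency and truncated to the top 50. A quick sketch (illustrative only, not part of the commit):

# Illustrative only, not part of the commit.
from textprocessor.letterfreq import letterFrequencies

freq = letterFrequencies("Maria are mere.")
# freq[0] -> punctuation, here [('.', 1.0)]
# freq[1] -> single letters, e.g. ('a', 0.25), ('r', 0.25), ('e', 0.25), ...
# freq[2] -> two-character substrings such as 'ar' and 're'
# freq[3] -> three-character substrings such as 'mar' and 'ari'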
2 src/textprocessor/wordanalysis.py (new file)
@@ -0,0 +1,2 @@
def analyzeWords(text):
    pass
128 src/tools/wikisource_downloader.py (new file)
@@ -0,0 +1,128 @@
import urllib.request
import urllib.error
from pyquery import PyQuery
import sqlite3
import re

BASE_URL = "https://ro.wikisource.org"
LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

def getAuthorList():
    authors = []
    for letter in LETTERS:
        print("Processing link page for letter", letter)
        # Read index page
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        data = urllib.request.urlopen(url).read()
        q = PyQuery(data)
        for item in q("div.mw-category-generated").find("a"):
            if (item.text.startswith("Autor:")):
                authorname = item.text[6:]
                authorlink = item.attrib['href']
                authors.append((authorname, authorlink))
    return list(set(authors))

def getAuthorWikiLink(query):
    wikilink = None
    body = query("div#mw-content-text")
    table = body.find("table")
    for link in table.find("a"):
        if "ro.wikipedia.org" in link.attrib['href']:
            wikilink = link.attrib['href']
    return wikilink

def getAuthorLinksList(authorname, query):
    links = []
    body = query("div#mw-content-text")
    for link in body.find("a"):
        address = link.attrib['href']
        ok = True
        if "http" in address:
            ok = False
        if "redlink" in address:
            ok = False
        if "Fi%C8%99ier:" in address:
            ok = False
        if "index.php" in address:
            ok = False
        if address.startswith("#"):
            ok = False
        if "Autor:" in address:
            ok = False
        if ok:
            links.append(link.attrib['href'])
    return links

def getAuthorBasicInfo(authorname, authorlink):
    info = {}
    data = urllib.request.urlopen(BASE_URL + authorlink).read()
    q = PyQuery(data)

    info["wiki"] = getAuthorWikiLink(q)
    info["links"] = getAuthorLinksList(authorname, q)

    return info

# def getAuthorWikiInfo(authorinfo):

#     # Nothing can be learned without wiki page
#     if authorinfo["wiki"] is None:
#         return authorinfo

#     try:
#         data = urllib.request.urlopen(authorinfo["wiki"]).read()
#         q = PyQuery(data)

#         # Find the birth date
#         body = q("#mw-content-text").text()
#         result = re.compile(u"Născut\s+([\w\s]+)").match(body)
#         if not result is None:
#             authorinfo["birthyear"] = result.group(0)

#     except urllib.error.HTTPError:
#         pass

#     return authorinfo

def getText(url):
    data = urllib.request.urlopen(BASE_URL + url).read()
    q = PyQuery(data)

    texttitle = q("h1").text()

    body = q("#mw-content-text")
    body.find("table").remove()

    textcontent = body.text()
    return (texttitle, textcontent)

def addAuthorToDb(authorinfo):
    con = sqlite3.connect("data/texts.db")
    c = con.cursor()
    c.execute("INSERT INTO Authors")

def getAllTexts():

    con = sqlite3.connect("data/texts.db")
    c = con.cursor()
    #c.execute("ALTER TABLE Authors ADD COLUMN wiki TEXT")
    id = 1

    authors = getAuthorList()
    for authorname, authorlink in authors:
        print("Processing author", authorname)
        authorinfo = getAuthorBasicInfo(authorname, authorlink)
        c.execute("INSERT INTO Authors(name,wiki) VALUES(?, ?)", (authorname, authorinfo["wiki"]))

        # authorinfo = getAuthorWikiInfo(authorinfo)
        for text in authorinfo["links"]:
            try:
                title, content = getText(text)
                c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)", (id, title, authorname))
                c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)", (id, content))
                id = id + 1
            except urllib.error.HTTPError:
                continue

    con.commit()

getAllTexts()
0 src/ttl/__init__.py (new file, empty)
62 src/ttl/ttlparser.py (new file)
@@ -0,0 +1,62 @@
'''
Created on May 22, 2016

@author: tibi
'''

from xml.dom import minidom;
from xml.parsers.expat import ExpatError
from model.Word import Word

def parseText(xmlText):

    words = []
    chunks = {}

    sentence_i = 0

    # get the root "segs" element
    try:
        dom = minidom.parseString(xmlText)
    except ExpatError as e:
        print("Error in text:", xmlText)
        print(e)
        exit(-1)

    alltext = dom.getElementsByTagName("segs")

    # iterate paragraphs
    for paragraph in alltext[0].getElementsByTagName("seg"):

        # iterate sentences
        for sentence in paragraph.getElementsByTagName("s"):

            # increment sentence index
            sentence_i += 1
            word_i = 0

            # iterate words
            for word in sentence.getElementsByTagName("w"):

                # increment word index
                word_i += 1

                # obtain word info
                wordText = word.firstChild.data
                lemma = word.getAttribute("lemma")
                ana = word.getAttribute("ana")
                chunk = word.getAttribute("chunk")

                # create word
                #w = Word(wordText, lemma, ana, chunk, sentence_i, word_i)
                #words.append(w)

                for c in chunk.split(","):
                    w = Word(wordText, lemma, ana, c, sentence_i, word_i)
                    words.append(w)
                    if chunks.get((sentence_i, c)) == None:
                        chunks[(sentence_i, c)] = [ w ]
                    else:
                        chunks[(sentence_i, c)].append(w)

    return (words, chunks)
34 src/ttl/ttlservice.py (new file)
@@ -0,0 +1,34 @@
# coding: utf-8
import zeep

def executeTtl(text):
    # Preprocess the text
    text = text.replace(u'ĭ', 'i')
    text = text.replace(u'ŭ', 'u')
    text = text.replace(u'à', 'a')

    client = zeep.Client("http://ws.racai.ro/ttlws.wsdl")
    textSgml = client.service.UTF8toSGML(text)
    result = client.service.XCES("ro", "id", textSgml)

    # Cleanup result - generate valid xml
    result = result.replace('’', '`')
    result = result.replace('ă', u'ă')
    result = result.replace('à', u'à')
    result = result.replace('â', u'â')
    result = result.replace('î', u'î')
    result = result.replace('ş', u'ș')
    result = result.replace('ţ', u'ț')
    result = result.replace('ŭ', u'u')
    result = result.replace('Ă', u'Ă')
    result = result.replace('À', u'À')
    result = result.replace('Â', u'Â')
    result = result.replace('Î', u'Î')
    result = result.replace('Ş', u'Ș')
    result = result.replace('Ţ', u'Ț')
    result = result.replace('Ŭ', u'U')

    xmlResult = "<?xml version=\"1.0\" encoding=\"utf-8\" ?><segs>"
    xmlResult += result
    xmlResult += "</segs>"
    return xmlResult
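The intended pipeline, kept commented out in src/main.py and src/test.py because a remote TTL call can take minutes per text, chains this service with ttlparser; a minimal sketch, assuming the ws.racai.ro endpoint is reachable:

# Illustrative sketch, not part of the commit: annotate a short text with
# the remote TTL service and parse the returned XCES XML.
from ttl import ttlservice, ttlparser

xml = ttlservice.executeTtl(u"Maria are mere. Ea mai are șapte pere.")
words, chunks = ttlparser.parseText(xml)
for word in words:
    print(word.sentenceIndex, word.wordIndex, word)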