Refactored code; organized letter and word metrics

parent 806d9cdedc
commit 64caeab14e
BIN  data/results.db (binary file not shown)
@@ -1,17 +1,17 @@
 # Fdb version 3
-["bibtex AuthorshipDetection"] 1495653012 "AuthorshipDetection.aux" "AuthorshipDetection.bbl" "AuthorshipDetection" 1495653014
+["bibtex AuthorshipDetection"] 1496507489 "AuthorshipDetection.aux" "AuthorshipDetection.bbl" "AuthorshipDetection" 1496507491
-"AuthorshipDetection.aux" 1495653014 1103 ccc8f941d3406278b8df384b7972f88a ""
+"AuthorshipDetection.aux" 1496507490 1103 ccc8f941d3406278b8df384b7972f88a ""
 "C:/Program Files/MiKTeX 2.9/bibtex/bst/bibtex/plain.bst" 1291868336 20613 bd3fbfa9f64872b81ac57a0dd2ed855f ""
 "bibliography/bibliography.bib" 1495652619 244 59c2a4b6607d9d24b3cfb7d66faed6a6 ""
 (generated)
 "AuthorshipDetection.bbl"
 "AuthorshipDetection.blg"
-["pdflatex"] 1495653013 "d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" "AuthorshipDetection.pdf" "AuthorshipDetection" 1495653014
+["pdflatex"] 1496507489 "d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" "AuthorshipDetection.pdf" "AuthorshipDetection" 1496507491
-"AuthorshipDetection.aux" 1495653014 1103 ccc8f941d3406278b8df384b7972f88a ""
+"AuthorshipDetection.aux" 1496507490 1103 ccc8f941d3406278b8df384b7972f88a ""
-"AuthorshipDetection.bbl" 1495653013 207 ce85d772213e84afc1a264255c65ba4e "bibtex AuthorshipDetection"
+"AuthorshipDetection.bbl" 1496507489 207 ce85d772213e84afc1a264255c65ba4e "bibtex AuthorshipDetection"
-"AuthorshipDetection.tdo" 1495653014 149 5ef486a5a64ba9d20e77900a17e38ab8 ""
+"AuthorshipDetection.tdo" 1496507490 149 5ef486a5a64ba9d20e77900a17e38ab8 ""
 "AuthorshipDetection.tex" 1495653011 1176 1eb61d9b5d736225f78429976651335c ""
-"AuthorshipDetection.toc" 1495653014 354 93710fd8d0aa2019c18a188df168978a ""
+"AuthorshipDetection.toc" 1496507490 354 93710fd8d0aa2019c18a188df168978a ""
 "C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmbx10.tfm" 1136768653 1328 c834bbb027764024c09d3d2bf908b5f0 ""
 "C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmbx12.tfm" 1136768653 1324 c910af8c371558dc20f2d7822f66fe64 ""
 "C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmmi12.tfm" 1136768653 1524 4414a8315f39513458b80dfc63bff03a ""
@@ -126,15 +126,15 @@
 "C:/Program Files/MiKTeX 2.9/tex/latex/tools/calc.sty" 1492423194 10503 d03d065f799d54f6b7e9b175f8d84279 ""
 "C:/Program Files/MiKTeX 2.9/tex/latex/xcolor/xcolor.sty" 1463135581 57049 34128738f682d033422ca125f82e5d62 ""
 "C:/Program Files/MiKTeX 2.9/tex/latex/xkeyval/xkeyval.sty" 1419274338 5114 9c1069474ff71dbc47d5006555e352d3 ""
-"C:/ProgramData/MiKTeX/2.9/pdftex/config/pdftex.map" 1495650797 80423 3cfba73105275d3ad410701d89fbdf7a ""
+"C:/ProgramData/MiKTeX/2.9/pdftex/config/pdftex.map" 1495981403 80423 3cfba73105275d3ad410701d89fbdf7a ""
 "C:/Users/Tibi/AppData/Local/MiKTeX/2.9/miktex/data/le/pdftex/pdflatex.fmt" 1495550378 4001358 752f924a412e4944af4e01c5a9beae77 ""
 "chapters/abstract.tex" 1495553267 495 58ad3ecfec349d84d898cef65bea34a8 ""
 "chapters/introduction.tex" 1495651567 1996 d9cf7ce732c566423ec6b6f8f56fcd7e ""
 "chapters/previouswork.tex" 1495652541 29 112f1954c35a5d96c415f13d34bcd056 ""
 "d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" 1495653011 1176 1eb61d9b5d736225f78429976651335c ""
 (generated)
+"AuthorshipDetection.pdf"
+"AuthorshipDetection.log"
+"AuthorshipDetection.tdo"
 "AuthorshipDetection.aux"
 "AuthorshipDetection.toc"
-"AuthorshipDetection.log"
-"AuthorshipDetection.pdf"
-"AuthorshipDetection.tdo"
Binary file not shown.

BIN  papers/AuthorshipDetection.synctex.gz (new file; binary file not shown)
11  src/exec.py  Normal file
@@ -0,0 +1,11 @@
+print("Acquiring texts...")
+import step0_acquire.wikisource_downloader
+
+print("Processing letter frequencies... ")
+import step1_text_processing.process_letter_frequencies
+
+print("Processing word frequencies... ")
+import step1_text_processing.process_word_frequencies
+
+print("Processing word lengths... ")
+import step1_text_processing.process_word_lengths
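Note that exec.py sequences the pipeline purely through import side effects: each processing module ends with an unconditional main() call (see the step1 files below), so importing it runs the whole stage. A minimal sketch of the same flow driven by importlib, using the module names added in this commit:

# sketch: the same stage ordering as exec.py, made explicit with importlib;
# importing each module still triggers its top-level code.
import importlib

STAGES = [
    "step0_acquire.wikisource_downloader",
    "step1_text_processing.process_letter_frequencies",
    "step1_text_processing.process_word_frequencies",
    "step1_text_processing.process_word_lengths",
]

for name in STAGES:
    print("Running stage:", name)
    importlib.import_module(name)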
40  src/main.py
@@ -1,40 +0,0 @@
-import logging
-import time
-# own
-import logger
-import storage.data
-import storage.results
-import textprocessor.letterfreq
-import ttl.ttlparser
-import ttl.ttlservice
-
-def init():
-    logger.init_logger(logging.WARNING)
-    storage.data.initializeFragmentDatabase("data/texts.db")
-    storage.results.initializeResultsDatabase("data/results.db", True)
-
-def processTexts():
-    count = storage.data.getTextCount()
-    current = 0
-    for item in storage.data.getAllTexts():
-        print("Processing item", current, "out of", count)
-        current = current + 1
-
-        itemid = item[0]
-        itemtext = item[1]
-
-        # obtain ttl analysis
-        # unfeasable - it takes 5-10 minutes for a single text
-        # ttlResult = ttl.ttlservice.executeTtl(itemtext)
-        # (words, chunks) = ttl.ttlparser.parseText(ttlResult)
-        # storage.results.storeTtlAnalysis(itemid, words)
-
-        # perform analysis
-        letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext)
-        storage.results.storeFrequencies(itemid, letterFreq)
-
-    print("Finished!")
-
-init()
-processTexts()
-
@@ -10,10 +10,12 @@ def getAuthorList():
     authors = []
     for letter in LETTERS:
         print("Processing link page for letter", letter)
+
         # Read index page
         url = BASE_URL + '/wiki/Categorie:Autori-' + letter
         data = urllib.request.urlopen(url).read()
        q = PyQuery(data)
+
        for item in q("div.mw-category-generated").find("a"):
            if (item.text.startswith("Autor:")):
                authorname = item.text[6:]
@@ -62,27 +64,6 @@ def getAuthorBasicInfo(authorname, authorlink):

    return info

-# def getAuthorWikiInfo(authorinfo):
-
-#     # Nothing can be learned without wiki page
-#     if authorinfo["wiki"] is None:
-#         return authorinfo
-
-#     try:
-#         data = urllib.request.urlopen(authorinfo["wiki"]).read()
-#         q = PyQuery(data)
-
-#         # Find the birth date
-#         body = q("#mw-content-text").text()
-#         result = re.compile(u"Născut\s+([\w\s]+)").match(body)
-#         if not result is None:
-#             authorinfo["birthyear"] = result.group(0)
-
-#     except urllib.error.HTTPError:
-#         pass
-
-#     return authorinfo
-
 def getText(url):
     data = urllib.request.urlopen(BASE_URL + url).read()
     q = PyQuery(data)
0  src/step1_text_processing/__init__.py  Normal file
@@ -1,5 +1,14 @@
+import logging
+import time
 import operator
-import storage
+# own
+from storage.data import TextStorage
+from storage.results.letterFrequencies import LetterFrequencyStorage
+
+import ttl.ttlparser
+import ttl.ttlservice
+
+FREQUENCY_TRESHOLD = 0.005

 def letterFrequencies(text):
     letterfreq = [{}, {}, {}, {}]
@@ -31,8 +40,31 @@ def letterFrequencies(text):
     # Almost done. Sort and remove irrelevant items (with low frequency), and normalize data
     for i in range(4):
         freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True)
-        freqFiltered = freqSorted[0:50]
-        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqFiltered]
+        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqSorted]
+        freqFiltered = [(symbol, freq) for symbol, freq in freqNormalized if freq >= FREQUENCY_TRESHOLD]
         letterfreq[i] = freqNormalized

     return letterfreq
+
+def processTexts(TextStorage, resultsStorage):
+    count = TextStorage.getTextCount()
+    current = 0
+    for item in TextStorage.getAllTexts():
+        print("Processing item", current, "out of", count)
+        current = current + 1
+
+        itemid = item[0]
+        itemtext = item[1]
+
+        # perform analysis
+        letterFreq = letterFrequencies(itemtext)
+        resultsStorage.store(itemid, letterFreq)
+
+    print("Finished!")
+
+def main():
+    TextStorage = TextStorage("data/texts.db")
+    resultsStorage = LetterFrequencyStorage("data/results.db")
+    processTexts(TextStorage, resultsStorage)
+
+main()
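The reordering above changes the pruning strategy for letter groups: the old code kept the 50 most frequent symbols and only then normalized, while the new code first normalizes every count by lettersum[i] and then thresholds at FREQUENCY_TRESHOLD = 0.005 (identifier spelling as committed). Note that letterfreq[i] is still assigned freqNormalized, so as committed the thresholded list freqFiltered is computed but not the one stored. A standalone sketch of the new order on toy counts:

# Toy illustration of the normalize-then-filter order introduced above.
import operator

FREQUENCY_TRESHOLD = 0.005

counts = {"a": 900, "b": 2, "c": 98}   # raw symbol counts for one category
total = sum(counts.values())           # plays the role of lettersum[i]

freqSorted = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
freqNormalized = [(symbol, freq / float(total)) for symbol, freq in freqSorted]
freqFiltered = [(symbol, freq) for symbol, freq in freqNormalized
                if freq >= FREQUENCY_TRESHOLD]

print(freqFiltered)   # [('a', 0.9), ('c', 0.098)] - 'b' falls below 0.005

A relative threshold keeps rare symbols out regardless of text length, which the fixed top-50 cut could not guarantee.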
70  src/step1_text_processing/process_word_frequencies.py  Normal file
@@ -0,0 +1,70 @@
+import logging
+import time
+import operator
+import nltk.tokenize
+import nltk.tokenize.moses
+import nltk.stem.snowball
+
+# own
+from storage.data import TextStorage
+from storage.results.wordFrequencies import WordFrequencyStorage
+import textutils
+
+FREQUENCY_TRESHOLD = 0.001
+
+def wordFrequencies(text):
+    text = textutils.fixDiacritics(text)
+    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')
+    stemmer = nltk.stem.snowball.RomanianStemmer()
+
+    words = tokenizer.tokenize(text)
+    frequencies = {}
+
+    for word in words:
+        # Skip non-words
+        if not textutils.isValidWord(word):
+            continue
+
+        # use word stem
+        stem = stemmer.stem(word)
+
+        if stem not in frequencies:
+            frequencies[stem] = 1
+        else:
+            frequencies[stem] += 1
+
+    # Normalize
+    freqSorted = sorted(frequencies.items(), key=operator.itemgetter(1), reverse=True)
+
+    freqNormalized = []
+    for word, freq in freqSorted:
+        freqNorm = float(freq) / len(words)
+        if freqNorm >= FREQUENCY_TRESHOLD:
+            freqNormalized.append((word, freqNorm))
+
+    return freqNormalized
+
+def processTexts(TextStorage, resultsStorage):
+    count = TextStorage.getTextCount()
+    current = 0
+    for item in TextStorage.getAllTexts():
+        print("Processing item", current, "out of", count)
+        current = current + 1
+
+        itemid = item[0]
+        itemtext = item[1]
+
+        # perform analysis
+        letterFreq = wordFrequencies(itemtext)
+
+        # store results
+        resultsStorage.store(itemid, letterFreq)
+
+    print("Finished!")
+
+def main():
+    TextStorage = TextStorage("data/texts.db")
+    resultsStorage = WordFrequencyStorage("data/results.db")
+    processTexts(TextStorage, resultsStorage)
+
+main()
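wordFrequencies() counts stem occurrences and divides by the total token count, discarding anything below FREQUENCY_TRESHOLD = 0.001. One portability note: nltk.tokenize.moses shipped only with older NLTK releases; the Moses tokenizer has since moved to the separate sacremoses package. A dependency-free sketch of the counting core, with a whitespace tokenizer and a lowercasing stand-in for the Romanian stemmer (both are assumptions, not what the commit uses):

# Assumption-laden sketch: str.split() replaces MosesTokenizer and
# lower() replaces RomanianStemmer, to show just the frequency logic.
from collections import Counter

FREQUENCY_TRESHOLD = 0.001

def word_frequencies(text):
    words = text.split()
    counts = Counter(w.lower() for w in words if w.isalnum())
    total = float(len(words))
    return [(w, c / total) for w, c in counts.most_common()
            if c / total >= FREQUENCY_TRESHOLD]

print(word_frequencies("Ana are mere si Ana are pere"))
# [('ana', 0.25), ('are', 0.25), ('mere', 0.125), ('si', 0.125), ('pere', 0.125)]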
60  src/step1_text_processing/process_word_lengths.py  Normal file
@@ -0,0 +1,60 @@
+import logging
+import time
+import operator
+import nltk.tokenize
+import nltk.tokenize.moses
+import nltk.stem.snowball
+
+# own
+from storage.data import TextStorage
+from storage.results.wordLengths import WordLengthStorage
+import textutils
+
+def wordLengths(text):
+    text = textutils.fixDiacritics(text)
+    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')
+    words = tokenizer.tokenize(text)
+
+    lengths = {}
+
+    for word in words:
+        # Skip non-words
+        if not textutils.isValidWord(word):
+            continue
+
+        l = len(word)
+
+        if l not in lengths:
+            lengths[l] = 1
+        else:
+            lengths[l] += 1
+
+    # normalize
+    norm_lengths = [(length, float(freq) / len(words)) for length, freq in lengths.items()]
+
+    return norm_lengths
+
+def processTexts(TextStorage, resultsStorage):
+    count = TextStorage.getTextCount()
+    current = 0
+    for item in TextStorage.getAllTexts():
+        print("Processing item", current, "out of", count)
+        current = current + 1
+
+        itemid = item[0]
+        itemtext = item[1]
+
+        # perform analysis
+        wordLens = wordLengths(itemtext)
+
+        # store results
+        resultsStorage.store(itemid, wordLens)
+
+    print("Finished!")
+
+def main():
+    TextStorage = TextStorage("data/texts.db")
+    resultsStorage = WordLengthStorage("data/results.db")
+    processTexts(TextStorage, resultsStorage)
+
+main()
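wordLengths() builds the same kind of histogram over word lengths. A minimal sketch, again with a whitespace tokenizer standing in for the Moses tokenizer; it keeps the committed convention of dividing by the total token count, including tokens that fail isValidWord:

# Sketch of the length histogram; divides by all tokens, as the
# committed code does.
from collections import Counter

def word_lengths(text):
    words = text.split()
    hist = Counter(len(w) for w in words if w.isalnum())
    return [(length, freq / float(len(words))) for length, freq in hist.items()]

print(word_lengths("un text simplu de test"))
# [(2, 0.4), (4, 0.4), (6, 0.2)]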
@@ -0,0 +1,33 @@
+import os.path
+import sqlite3
+
+class Storage:
+    def __init__(self, dbFile):
+        self.__dbFile = dbFile
+        self.__initialize()
+        self.__con = None
+        self.__cur = None
+
+    def __initialize(self):
+        self._createDatabase()
+
+    def _createDatabase(self):
+        pass
+
+    def _destroyDatabase(self):
+        pass
+
+    def connect(self):
+        self.__con = sqlite3.connect(self.__dbFile)
+        self.__cur = self.__con.cursor()
+        return self.__cur
+
+    def commit(self, doClose=True):
+        self.__con.commit()
+        if doClose:
+            self.__cur.close()
+            self.__con.close()
+
+    def recreateDatabase(self):
+        self._destroyDatabase()
+        self._createDatabase()
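The new Storage base class is a small template method: constructing any subclass immediately calls _createDatabase(), and connect()/commit() bracket each sqlite3 session (commit() closes the cursor and connection by default). A hypothetical subclass, mirroring the pattern the result stores below follow (NoteStorage and its Notes table are illustration only, not part of the commit):

# Hypothetical subclass showing the intended template-method usage.
import storage

class NoteStorage(storage.Storage):
    def _createDatabase(self):
        c = self.connect()
        c.execute("CREATE TABLE IF NOT EXISTS Notes (id INTEGER, body TEXT)")
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute("DROP TABLE IF EXISTS Notes")
        self.commit()

    def store(self, idnote, body):
        c = self.connect()
        c.execute("INSERT INTO Notes VALUES (?, ?)", (idnote, body))
        self.commit()

notes = NoteStorage("data/notes.db")   # creates the table on construction
notes.store(1, "hello")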
@@ -1,80 +0,0 @@
-import logging
-import os
-from model import *
-import sqlite3
-
-log = logging.getLogger("storage")
-
-DB_FRAGMENTS = ""
-
-# Commands
-
-# birth location - general area, not exact location (i.e. Transylvania)
-# birth origin - rural or urban
-# studies - masters, bachelors, high school, middle school, primary school
-# occupation - comma separated if there are multiple
-# studiesAbroad - foreign cities where author studied (comma separated)
-COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
-    name TEXT PRIMARY KEY,
-    birthYear INTEGER,
-    birthLocation TEXT,
-    birthOrigin TEXT,
-    studies TEXT,
-    occupations TEXT,
-    studiesAbroad TEXT
-)"""
-
-# genre - short story (nuvela), novel (roman), poem etc
-# movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
-# tags - other relevant information (i.e. psychological)
-COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
-    id INTEGER PRIMARY KEY,
-    title TEXT,
-    year INTEGER,
-    author TEXT REFERENCES Authors(name),
-    genre TEXT,
-    movement TEXT,
-    tags TEXT
-)"""
-
-# contains the actual text
-COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
-    id INTEGER REFERENCES Fragments(id),
-    content TEXT
-)"""
-
-# Initialize databases
-def initializeFragmentDatabase(dbFile):
-    global DB_FRAGMENTS
-    DB_FRAGMENTS = dbFile
-
-    if not os.path.exists(dbFile):
-        log.info("Text database %s not found. Will create database.", dbFile)
-        con = sqlite3.connect(dbFile)
-        c = con.cursor()
-        c.execute(COMMAND_CREATE_AUTHORS)
-        c.execute(COMMAND_CREATE_FRAGMENTS)
-        c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT)
-        con.commit()
-        con.close()
-        log.info("Database created!")
-
-def getTextCount():
-    con = sqlite3.connect(DB_FRAGMENTS)
-    c = con.cursor()
-    c.execute("SELECT COUNT(*) FROM Fragments")
-    item = c.fetchone()
-    c.close()
-    con.close()
-    return item[0]
-
-def getAllTexts():
-    con = sqlite3.connect(DB_FRAGMENTS)
-    c = con.cursor()
-    c.execute("SELECT id, content FROM FragmentsContent")
-
-    items = c.fetchall()
-
-    c.close()
-    con.close()
-    return items
@@ -1,84 +0,0 @@
-import logging
-import os
-from model.Word import *
-import sqlite3
-
-log = logging.getLogger("storage")
-
-DB_RESULTS = ""
-
-COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies (
-    idtext INTEGER,
-    lettergroup TEXT,
-    category TEXT,
-    frequency REAL
-)"""
-
-COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords (
-    idtext INTEGER,
-    wordIndex INTEGER,
-    sentenceIndex INTEGER,
-    word TEXT,
-    lemma TEXT,
-    analysis TEXT,
-    chunk TEXT
-)"""
-
-# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
-#     idtext INTEGER,
-#     wordlength INTEGER,
-#     frequency REAL
-# )"""
-
-def initializeResultsDatabase(dbFile, cleanupOldData):
-    global DB_RESULTS
-    DB_RESULTS = dbFile
-
-    # cleanup old data
-    if cleanupOldData:
-        con = sqlite3.connect(DB_RESULTS)
-        c = con.cursor()
-
-        try:
-            c.execute("DROP TABLE LetterFrequencies")
-        except sqlite3.OperationalError:
-            pass
-        c.execute(COMMAND_CREATE_LETTER_FREQUENCIES)
-
-        try:
-            c.execute("DROP TABLE TextWords")
-        except sqlite3.OperationalError:
-            pass
-        c.execute(COMMAND_CREATE_TEXT_WORDS)
-
-        con.commit()
-        c.close()
-        con.close()
-
-
-def storeFrequencies(idtext, freq):
-    con = sqlite3.connect(DB_RESULTS)
-    c = con.cursor()
-
-    # add data
-    chr = ['p', 'l1', 'l2', 'l3']
-    for i in range(4):
-        for let, fr in freq[i]:
-            c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, chr[i], fr))
-
-    con.commit()
-    c.close()
-    con.close()
-
-def storeTtlAnalysis(idtext, words):
-    con = sqlite3.connect(DB_RESULTS)
-    c = con.cursor()
-
-    # store words
-    for word in words:
-        c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk))
-
-    # finish
-    con.commit()
-    c.close()
-    con.close()
0  src/storage/results/__init__.py  Normal file
27  src/storage/results/letterFrequencies.py  Normal file
@@ -0,0 +1,27 @@
+import storage
+
+class LetterFrequencyStorage(storage.Storage):
+    __COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE IF NOT EXISTS LetterFrequencies (
+        idtext INTEGER,
+        lettergroup TEXT,
+        category TEXT,
+        frequency REAL
+    )"""
+
+    def _createDatabase(self):
+        c = self.connect()
+        c.execute(self.__COMMAND_CREATE_LETTER_FREQUENCIES)
+        self.commit()
+
+    def _destroyDatabase(self):
+        c = self.connect()
+        c.execute('DROP TABLE IF EXISTS LetterFrequencies')
+        self.commit()
+
+    def store(self, idtext, frequencies):
+        c = self.connect()
+        chr = ['p', 'l1', 'l2', 'l3']
+        for i in range(4):
+            for let, fr in frequencies[i]:
+                c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, chr[i], fr))
+        self.commit()
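Each stored row is (idtext, lettergroup, category, frequency), where category is one of the codes 'p', 'l1', 'l2', 'l3' carried over from the old storeFrequencies(). A sketch of reading results back with plain sqlite3 (the idtext and category values are hypothetical; the schema is the one created above):

# Read back the five most frequent 'l1' entries for text 1.
import sqlite3

con = sqlite3.connect("data/results.db")
rows = con.execute(
    "SELECT lettergroup, frequency FROM LetterFrequencies "
    "WHERE idtext = ? AND category = ? "
    "ORDER BY frequency DESC LIMIT 5",
    (1, "l1"),
).fetchall()
con.close()
print(rows)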
24  src/storage/results/wordFrequencies.py  Normal file
@@ -0,0 +1,24 @@
+import storage
+
+class WordFrequencyStorage(storage.Storage):
+    __COMMAND_CREATE_WORD_FREQUENCIES = """CREATE TABLE IF NOT EXISTS WordFrequencies (
+        idtext INTEGER,
+        word TEXT,
+        frequency REAL
+    )"""
+
+    def _createDatabase(self):
+        c = self.connect()
+        c.execute(self.__COMMAND_CREATE_WORD_FREQUENCIES)
+        self.commit()
+
+    def _destroyDatabase(self):
+        c = self.connect()
+        c.execute('DROP TABLE IF EXISTS WordFrequencies')
+        self.commit()
+
+    def store(self, idtext, frequencies):
+        c = self.connect()
+        for word, freq in frequencies:
+            c.execute('INSERT INTO WordFrequencies VALUES(?, ?, ?)', (idtext, word, freq))
+        self.commit()
24  src/storage/results/wordLengths.py  Normal file
@@ -0,0 +1,24 @@
+import storage
+
+class WordLengthStorage(storage.Storage):
+    __COMMAND_CREATE_WORD_LENGTHS = """CREATE TABLE IF NOT EXISTS WordLengths (
+        idtext INTEGER,
+        wordlength INTEGER,
+        frequency REAL
+    )"""
+
+    def _createDatabase(self):
+        c = self.connect()
+        c.execute(self.__COMMAND_CREATE_WORD_LENGTHS)
+        self.commit()
+
+    def _destroyDatabase(self):
+        c = self.connect()
+        c.execute('DROP TABLE IF EXISTS WordLengths')
+        self.commit()
+
+    def store(self, idtext, frequencies):
+        c = self.connect()
+        for length, frequency in frequencies:
+            c.execute("INSERT INTO WordLengths VALUES(?, ?, ?)", (idtext, length, frequency))
+        self.commit()
65  src/storage/texts.py  Normal file
@@ -0,0 +1,65 @@
+import storage
+
+class TextStorage(storage.Storage):
+
+    # birth location - general area, not exact location (i.e. Transylvania)
+    # birth origin - rural or urban
+    # studies - masters, bachelors, high school, middle school, primary school
+    # occupation - comma separated if there are multiple
+    # studiesAbroad - foreign cities where author studied (comma separated)
+    __COMMAND_CREATE_AUTHORS = """CREATE TABLE IF NOT EXISTS Authors (
+        name TEXT PRIMARY KEY,
+        birthYear INTEGER,
+        birthLocation TEXT,
+        birthOrigin TEXT,
+        studies TEXT,
+        occupations TEXT,
+        studiesAbroad TEXT
+    )"""
+
+    # genre - short story (nuvela), novel (roman), poem etc
+    # movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
+    # tags - other relevant information (i.e. psychological)
+    __COMMAND_CREATE_FRAGMENTS = """CREATE TABLE IF NOT EXISTS Fragments (
+        id INTEGER PRIMARY KEY,
+        title TEXT,
+        year INTEGER,
+        author TEXT REFERENCES Authors(name),
+        genre TEXT,
+        movement TEXT,
+        tags TEXT
+    )"""
+
+    # contains the actual text
+    __COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE IF NOT EXISTS FragmentsContent (
+        id INTEGER REFERENCES Fragments(id),
+        content TEXT
+    )"""
+
+    def _createDatabase(self):
+        c = self.connect()
+        c.execute(self.__COMMAND_CREATE_AUTHORS)
+        c.execute(self.__COMMAND_CREATE_FRAGMENTS)
+        c.execute(self.__COMMAND_CREATE_FRAGMENTS_CONTENT)
+        self.commit()
+
+    def _destroyDatabase(self):
+        c = self.connect()
+        c.execute('DROP TABLE IF EXISTS Authors')
+        c.execute('DROP TABLE IF EXISTS Fragments')
+        c.execute('DROP TABLE IF EXISTS FragmentsContent')
+        self.commit()
+
+    def getTextCount(self):
+        c = self.connect()
+        c.execute("SELECT COUNT(*) FROM Fragments")
+        item = c.fetchone()
+        self.commit()
+        return item[0]
+
+    def getAllTexts(self):
+        c = self.connect()
+        c.execute("SELECT id, content FROM FragmentsContent")
+        items = c.fetchall()
+        self.commit()
+        return items
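TextStorage keeps the old schema but gains IF NOT EXISTS guards, so construction is idempotent. One wrinkle visible in this commit: the step1 modules import the class via storage.data, while the new file lives at src/storage/texts.py, so one of the two paths presumably changes later. A minimal usage sketch against the file as added:

# Iterate stored fragments the same way the step1 processTexts() loops do.
from storage.texts import TextStorage

texts = TextStorage("data/texts.db")      # creates tables if missing
print("fragments:", texts.getTextCount())
for itemid, content in texts.getAllTexts():
    print(itemid, len(content))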
@@ -1,2 +0,0 @@
-def analyzeWords(text):
-    pass
16  src/textutils/__init__.py  Normal file
@@ -0,0 +1,16 @@
+def fixDiacritics(text):
+    text = text.replace(u'ĭ', 'i')
+    text = text.replace(u'ŭ', 'u')
+    text = text.replace(u'à', 'a')
+    return text
+
+def isValidWord(word):
+    # Alphanumeric => word
+    if word.isalnum():
+        return True
+
+    # Some words might be contractions, which finish/begin with a '
+    if word[1:].isalnum() or word[:-1].isalnum():
+        return True
+
+    return False
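fixDiacritics() maps a few archaic Romanian letters to their modern forms, and isValidWord() accepts alphanumeric tokens plus tokens with a single leading or trailing apostrophe; an apostrophe in the middle of a word is still rejected. A quick behavior check against the code as added:

# Expected behavior of the new helpers (run with src/ on the path).
import textutils

print(textutils.fixDiacritics(u"ĭarăşĭ"))    # iarăşi  (archaic ĭ -> i)
print(textutils.isValidWord("mere"))         # True  - alphanumeric
print(textutils.isValidWord("'neaţa"))       # True  - leading apostrophe
print(textutils.isValidWord("s'au"))         # False - apostrophe mid-word
print(textutils.isValidWord("--"))           # False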
8  src/tools/test.php  Normal file
@@ -0,0 +1,8 @@
+<!DOCTYPE html>
+<html>
+<body>
+
+<?php echo "Hello world!"?>
+
+</body>
+</html>