Refactored code; organized letter and word metrics

parent 806d9cdedc
commit 64caeab14e

BIN data/results.db
Binary file not shown.
@@ -1,17 +1,17 @@
# Fdb version 3
["bibtex AuthorshipDetection"] 1495653012 "AuthorshipDetection.aux" "AuthorshipDetection.bbl" "AuthorshipDetection" 1495653014
"AuthorshipDetection.aux" 1495653014 1103 ccc8f941d3406278b8df384b7972f88a ""
["bibtex AuthorshipDetection"] 1496507489 "AuthorshipDetection.aux" "AuthorshipDetection.bbl" "AuthorshipDetection" 1496507491
"AuthorshipDetection.aux" 1496507490 1103 ccc8f941d3406278b8df384b7972f88a ""
"C:/Program Files/MiKTeX 2.9/bibtex/bst/bibtex/plain.bst" 1291868336 20613 bd3fbfa9f64872b81ac57a0dd2ed855f ""
"bibliography/bibliography.bib" 1495652619 244 59c2a4b6607d9d24b3cfb7d66faed6a6 ""
(generated)
"AuthorshipDetection.bbl"
"AuthorshipDetection.blg"
["pdflatex"] 1495653013 "d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" "AuthorshipDetection.pdf" "AuthorshipDetection" 1495653014
"AuthorshipDetection.aux" 1495653014 1103 ccc8f941d3406278b8df384b7972f88a ""
"AuthorshipDetection.bbl" 1495653013 207 ce85d772213e84afc1a264255c65ba4e "bibtex AuthorshipDetection"
"AuthorshipDetection.tdo" 1495653014 149 5ef486a5a64ba9d20e77900a17e38ab8 ""
["pdflatex"] 1496507489 "d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" "AuthorshipDetection.pdf" "AuthorshipDetection" 1496507491
"AuthorshipDetection.aux" 1496507490 1103 ccc8f941d3406278b8df384b7972f88a ""
"AuthorshipDetection.bbl" 1496507489 207 ce85d772213e84afc1a264255c65ba4e "bibtex AuthorshipDetection"
"AuthorshipDetection.tdo" 1496507490 149 5ef486a5a64ba9d20e77900a17e38ab8 ""
"AuthorshipDetection.tex" 1495653011 1176 1eb61d9b5d736225f78429976651335c ""
"AuthorshipDetection.toc" 1495653014 354 93710fd8d0aa2019c18a188df168978a ""
"AuthorshipDetection.toc" 1496507490 354 93710fd8d0aa2019c18a188df168978a ""
"C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmbx10.tfm" 1136768653 1328 c834bbb027764024c09d3d2bf908b5f0 ""
"C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmbx12.tfm" 1136768653 1324 c910af8c371558dc20f2d7822f66fe64 ""
"C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmmi12.tfm" 1136768653 1524 4414a8315f39513458b80dfc63bff03a ""
@@ -126,15 +126,15 @@
"C:/Program Files/MiKTeX 2.9/tex/latex/tools/calc.sty" 1492423194 10503 d03d065f799d54f6b7e9b175f8d84279 ""
"C:/Program Files/MiKTeX 2.9/tex/latex/xcolor/xcolor.sty" 1463135581 57049 34128738f682d033422ca125f82e5d62 ""
"C:/Program Files/MiKTeX 2.9/tex/latex/xkeyval/xkeyval.sty" 1419274338 5114 9c1069474ff71dbc47d5006555e352d3 ""
"C:/ProgramData/MiKTeX/2.9/pdftex/config/pdftex.map" 1495650797 80423 3cfba73105275d3ad410701d89fbdf7a ""
"C:/ProgramData/MiKTeX/2.9/pdftex/config/pdftex.map" 1495981403 80423 3cfba73105275d3ad410701d89fbdf7a ""
"C:/Users/Tibi/AppData/Local/MiKTeX/2.9/miktex/data/le/pdftex/pdflatex.fmt" 1495550378 4001358 752f924a412e4944af4e01c5a9beae77 ""
"chapters/abstract.tex" 1495553267 495 58ad3ecfec349d84d898cef65bea34a8 ""
"chapters/introduction.tex" 1495651567 1996 d9cf7ce732c566423ec6b6f8f56fcd7e ""
"chapters/previouswork.tex" 1495652541 29 112f1954c35a5d96c415f13d34bcd056 ""
"d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" 1495653011 1176 1eb61d9b5d736225f78429976651335c ""
(generated)
"AuthorshipDetection.pdf"
"AuthorshipDetection.log"
"AuthorshipDetection.tdo"
"AuthorshipDetection.aux"
"AuthorshipDetection.toc"
"AuthorshipDetection.log"
"AuthorshipDetection.pdf"
"AuthorshipDetection.tdo"
Binary file not shown.
BIN papers/AuthorshipDetection.synctex.gz Normal file
Binary file not shown.
11 src/exec.py Normal file
@@ -0,0 +1,11 @@
print("Acquiring texts...")
import step0_acquire.wikisource_downloader

print("Processing letter frequencies... ")
import step1_text_processing.process_letter_frequencies

print("Processing word frequencies... ")
import step1_text_processing.process_word_frequencies

print("Processing word lengths... ")
import step1_text_processing.process_word_lengths
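Note: exec.py drives the pipeline purely through import side effects; each step1 module calls its own main() at the bottom of the file, so importing it runs the whole step. A minimal alternative sketch (assuming the module-level main() calls are removed, and that the downloader gains a main() entry point; both are hypothetical, not part of this commit):

import step0_acquire.wikisource_downloader as step0
import step1_text_processing.process_letter_frequencies as letters
import step1_text_processing.process_word_frequencies as words
import step1_text_processing.process_word_lengths as lengths

if __name__ == "__main__":
    print("Acquiring texts...")
    step0.main()    # hypothetical entry point
    print("Processing letter frequencies...")
    letters.main()
    print("Processing word frequencies...")
    words.main()
    print("Processing word lengths...")
    lengths.main()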
40 src/main.py
@@ -1,40 +0,0 @@
import logging
import time
# own
import logger
import storage.data
import storage.results
import textprocessor.letterfreq
import ttl.ttlparser
import ttl.ttlservice


def init():
    logger.init_logger(logging.WARNING)
    storage.data.initializeFragmentDatabase("data/texts.db")
    storage.results.initializeResultsDatabase("data/results.db", True)


def processTexts():
    count = storage.data.getTextCount()
    current = 0
    for item in storage.data.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # obtain ttl analysis
        # unfeasible - it takes 5-10 minutes for a single text
        # ttlResult = ttl.ttlservice.executeTtl(itemtext)
        # (words, chunks) = ttl.ttlparser.parseText(ttlResult)
        # storage.results.storeTtlAnalysis(itemid, words)

        # perform analysis
        letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext)
        storage.results.storeFrequencies(itemid, letterFreq)

    print("Finished!")


init()
processTexts()
@@ -10,10 +10,12 @@ def getAuthorList():
    authors = []
    for letter in LETTERS:
        print("Processing link page for letter", letter)

        # Read index page
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        data = urllib.request.urlopen(url).read()
        q = PyQuery(data)

        for item in q("div.mw-category-generated").find("a"):
            if (item.text.startswith("Autor:")):
                authorname = item.text[6:]
@@ -62,27 +64,6 @@ def getAuthorBasicInfo(authorname, authorlink):

    return info

# def getAuthorWikiInfo(authorinfo):

#     # Nothing can be learned without wiki page
#     if authorinfo["wiki"] is None:
#         return authorinfo

#     try:
#         data = urllib.request.urlopen(authorinfo["wiki"]).read()
#         q = PyQuery(data)

#         # Find the birth date
#         body = q("#mw-content-text").text()
#         result = re.compile(u"Născut\s+([\w\s]+)").match(body)
#         if not result is None:
#             authorinfo["birthyear"] = result.group(0)

#     except urllib.error.HTTPError:
#         pass

#     return authorinfo

def getText(url):
    data = urllib.request.urlopen(BASE_URL + url).read()
    q = PyQuery(data)
0 src/step1_text_processing/__init__.py Normal file
@@ -1,5 +1,14 @@
import logging
import time
import operator
import storage
# own
from storage.texts import TextStorage
from storage.results.letterFrequencies import LetterFrequencyStorage

import ttl.ttlparser
import ttl.ttlservice

FREQUENCY_THRESHOLD = 0.005

def letterFrequencies(text):
    letterfreq = [{}, {}, {}, {}]
@@ -31,8 +40,31 @@ def letterFrequencies(text):
    # Almost done. Sort and remove irrelevant items (with low frequency), and normalize data
    for i in range(4):
        freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True)
        freqFiltered = freqSorted[0:50]
        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqFiltered]
        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqSorted]
        freqFiltered = [(symbol, freq) for symbol, freq in freqNormalized if freq >= FREQUENCY_THRESHOLD]
        letterfreq[i] = freqFiltered

    return letterfreq

def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0
    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        letterFreq = letterFrequencies(itemtext)
        resultsStorage.store(itemid, letterFreq)

    print("Finished!")

def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = LetterFrequencyStorage("data/results.db")
    processTexts(textStorage, resultsStorage)

main()
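For reference, the sort, normalize, and filter step above behaves like this on toy counts (a sketch; the 0.005 threshold and the per-category lettersum divisor come from the code above):

import operator

FREQUENCY_THRESHOLD = 0.005

counts = {"a": 300, "b": 100, "q": 1}   # toy letter counts for one category
total = sum(counts.values())            # plays the role of lettersum[i]

freqSorted = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
freqNormalized = [(sym, n / total) for sym, n in freqSorted]
freqFiltered = [(sym, f) for sym, f in freqNormalized if f >= FREQUENCY_THRESHOLD]

print(freqFiltered)   # [('a', 0.748...), ('b', 0.249...)] - 'q' falls below the threshold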
70 src/step1_text_processing/process_word_frequencies.py Normal file
@@ -0,0 +1,70 @@
import logging
import time
import operator
import nltk.tokenize
import nltk.tokenize.moses
import nltk.stem.snowball

# own
from storage.texts import TextStorage
from storage.results.wordFrequencies import WordFrequencyStorage
import textutils

FREQUENCY_THRESHOLD = 0.001

def wordFrequencies(text):
    text = textutils.fixDiacritics(text)
    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')
    stemmer = nltk.stem.snowball.RomanianStemmer()

    words = tokenizer.tokenize(text)
    frequencies = {}

    for word in words:
        # Skip non-words
        if not textutils.isValidWord(word):
            continue

        # use word stem
        stem = stemmer.stem(word)

        if stem not in frequencies:
            frequencies[stem] = 1
        else:
            frequencies[stem] += 1

    # Normalize
    freqSorted = sorted(frequencies.items(), key=operator.itemgetter(1), reverse=True)

    freqNormalized = []
    for word, freq in freqSorted:
        freqNorm = float(freq) / len(words)
        if freqNorm >= FREQUENCY_THRESHOLD:
            freqNormalized.append((word, freqNorm))

    return freqNormalized

def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0
    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        wordFreq = wordFrequencies(itemtext)

        # store results
        resultsStorage.store(itemid, wordFreq)

    print("Finished!")

def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = WordFrequencyStorage("data/results.db")
    processTexts(textStorage, resultsStorage)

main()
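Portability note: nltk.tokenize.moses was removed from newer NLTK releases; the same tokenizer now ships in the separate sacremoses package. A rough drop-in sketch (assumes sacremoses is installed):

from sacremoses import MosesTokenizer
import nltk.stem.snowball

tokenizer = MosesTokenizer(lang='ro')
stemmer = nltk.stem.snowball.RomanianStemmer()

words = tokenizer.tokenize("Aceasta este o propozitie de test.")
stems = [stemmer.stem(w) for w in words]
print(stems)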
60 src/step1_text_processing/process_word_lengths.py Normal file
@@ -0,0 +1,60 @@
import logging
import time
import operator
import nltk.tokenize
import nltk.tokenize.moses
import nltk.stem.snowball

# own
from storage.texts import TextStorage
from storage.results.wordLengths import WordLengthStorage
import textutils

def wordLengths(text):
    text = textutils.fixDiacritics(text)
    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')
    words = tokenizer.tokenize(text)

    lengths = {}

    for word in words:
        # Skip non-words
        if not textutils.isValidWord(word):
            continue

        length = len(word)

        if length not in lengths:
            lengths[length] = 1
        else:
            lengths[length] += 1

    # normalize
    norm_lengths = [(length, float(freq) / len(words)) for length, freq in lengths.items()]

    return norm_lengths

def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0
    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        wordLens = wordLengths(itemtext)

        # store results
        resultsStorage.store(itemid, wordLens)

    print("Finished!")

def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = WordLengthStorage("data/results.db")
    processTexts(textStorage, resultsStorage)

main()
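The normalization at the end turns the raw histogram into (length, relative frequency) pairs. A toy run of that step, without the tokenizer dependency (a sketch):

words = ["ana", "are", "mere", "si", "pere"]
lengths = {}
for word in words:
    lengths[len(word)] = lengths.get(len(word), 0) + 1

norm_lengths = [(length, freq / len(words)) for length, freq in lengths.items()]
print(norm_lengths)   # [(3, 0.4), (4, 0.4), (2, 0.2)]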
@@ -0,0 +1,33 @@
import os.path
import sqlite3

class Storage:
    def __init__(self, dbFile):
        self.__dbFile = dbFile
        self.__con = None
        self.__cur = None
        self.__initialize()

    def __initialize(self):
        self._createDatabase()

    def _createDatabase(self):
        pass

    def _destroyDatabase(self):
        pass

    def connect(self):
        self.__con = sqlite3.connect(self.__dbFile)
        self.__cur = self.__con.cursor()
        return self.__cur

    def commit(self, doClose=True):
        self.__con.commit()
        if doClose:
            self.__cur.close()
            self.__con.close()

    def recreateDatabase(self):
        self._destroyDatabase()
        self._createDatabase()
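The result-storage subclasses below all follow the same contract: override _createDatabase/_destroyDatabase and wrap each write in a connect()/commit() pair. A minimal sketch of the pattern (the Notes table is hypothetical, not part of this commit):

import storage

class NoteStorage(storage.Storage):
    # hypothetical example table
    __COMMAND_CREATE_NOTES = "CREATE TABLE IF NOT EXISTS Notes (idtext INTEGER, note TEXT)"

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_NOTES)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute("DROP TABLE IF EXISTS Notes")
        self.commit()

    def store(self, idtext, note):
        c = self.connect()
        c.execute("INSERT INTO Notes VALUES (?, ?)", (idtext, note))
        self.commit()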
@@ -1,80 +0,0 @@
import logging
import os
from model import *
import sqlite3

log = logging.getLogger("storage")

DB_FRAGMENTS = ""

# Commands

# birth location - general area, not exact location (i.e. Transylvania)
# birth origin - rural or urban
# studies - masters, bachelors, high school, middle school, primary school
# occupation - comma separated if there are multiple
# studiesAbroad - foreign cities where author studied (comma separated)
COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
    name TEXT PRIMARY KEY,
    birthYear INTEGER,
    birthLocation TEXT,
    birthOrigin TEXT,
    studies TEXT,
    occupations TEXT,
    studiesAbroad TEXT
)"""

# genre - short story (nuvela), novel (roman), poem etc
# movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
# tags - other relevant information (i.e. psychological)
COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
    id INTEGER PRIMARY KEY,
    title TEXT,
    year INTEGER,
    author TEXT REFERENCES Authors(name),
    genre TEXT,
    movement TEXT,
    tags TEXT
)"""

# contains the actual text
COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
    id INTEGER REFERENCES Fragments(id),
    content TEXT
)"""

# Initialize databases
def initializeFragmentDatabase(dbFile):
    global DB_FRAGMENTS
    DB_FRAGMENTS = dbFile

    if not os.path.exists(dbFile):
        log.info("Text database %s not found. Will create database.", dbFile)
        con = sqlite3.connect(dbFile)
        c = con.cursor()
        c.execute(COMMAND_CREATE_AUTHORS)
        c.execute(COMMAND_CREATE_FRAGMENTS)
        c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT)
        con.commit()
        con.close()
        log.info("Database created!")

def getTextCount():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT COUNT(*) FROM Fragments")
    item = c.fetchone()
    c.close()
    con.close()
    return item[0]

def getAllTexts():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT id, content FROM FragmentsContent")

    items = c.fetchall()

    c.close()
    con.close()
    return items
@@ -1,84 +0,0 @@
import logging
import os
from model.Word import *
import sqlite3

log = logging.getLogger("storage")

DB_RESULTS = ""

COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies (
    idtext INTEGER,
    lettergroup TEXT,
    category TEXT,
    frequency REAL
)"""

COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords (
    idtext INTEGER,
    wordIndex INTEGER,
    sentenceIndex INTEGER,
    word TEXT,
    lemma TEXT,
    analysis TEXT,
    chunk TEXT
)"""

# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
#     idtext INTEGER,
#     wordlength INTEGER,
#     frequency REAL
# )"""

def initializeResultsDatabase(dbFile, cleanupOldData):
    global DB_RESULTS
    DB_RESULTS = dbFile

    # cleanup old data
    if cleanupOldData:
        con = sqlite3.connect(DB_RESULTS)
        c = con.cursor()

        try:
            c.execute("DROP TABLE LetterFrequencies")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_LETTER_FREQUENCIES)

        try:
            c.execute("DROP TABLE TextWords")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_TEXT_WORDS)

        con.commit()
        c.close()
        con.close()


def storeFrequencies(idtext, freq):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # add data
    chr = ['p', 'l1', 'l2', 'l3']
    for i in range(4):
        for let, fr in freq[i]:
            c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, chr[i], fr))

    con.commit()
    c.close()
    con.close()

def storeTtlAnalysis(idtext, words):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # store words
    for word in words:
        c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk))

    # finish
    con.commit()
    c.close()
    con.close()
0 src/storage/results/__init__.py Normal file
27 src/storage/results/letterFrequencies.py Normal file
@@ -0,0 +1,27 @@
import storage

class LetterFrequencyStorage(storage.Storage):
    __COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE IF NOT EXISTS LetterFrequencies (
        idtext INTEGER,
        lettergroup TEXT,
        category TEXT,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_LETTER_FREQUENCIES)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS LetterFrequencies')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()
        # category labels for the four analyzer buckets
        categories = ['p', 'l1', 'l2', 'l3']
        for i in range(4):
            for let, fr in frequencies[i]:
                c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, categories[i], fr))
        self.commit()
24 src/storage/results/wordFrequencies.py Normal file
@@ -0,0 +1,24 @@
import storage

class WordFrequencyStorage(storage.Storage):
    __COMMAND_CREATE_WORD_FREQUENCIES = """CREATE TABLE IF NOT EXISTS WordFrequencies (
        idtext INTEGER,
        word TEXT,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_WORD_FREQUENCIES)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS WordFrequencies')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()
        for word, freq in frequencies:
            c.execute('INSERT INTO WordFrequencies VALUES(?, ?, ?)', (idtext, word, freq))
        self.commit()
24 src/storage/results/wordLengths.py Normal file
@@ -0,0 +1,24 @@
import storage

class WordLengthStorage(storage.Storage):
    __COMMAND_CREATE_WORD_LENGTHS = """CREATE TABLE IF NOT EXISTS WordLengths (
        idtext INTEGER,
        wordlength INTEGER,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_WORD_LENGTHS)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS WordLengths')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()
        for length, frequency in frequencies:
            c.execute("INSERT INTO WordLengths VALUES(?, ?, ?)", (idtext, length, frequency))
        self.commit()
65 src/storage/texts.py Normal file
@@ -0,0 +1,65 @@
import storage

class TextStorage(storage.Storage):

    # birth location - general area, not exact location (i.e. Transylvania)
    # birth origin - rural or urban
    # studies - masters, bachelors, high school, middle school, primary school
    # occupation - comma separated if there are multiple
    # studiesAbroad - foreign cities where author studied (comma separated)
    __COMMAND_CREATE_AUTHORS = """CREATE TABLE IF NOT EXISTS Authors (
        name TEXT PRIMARY KEY,
        birthYear INTEGER,
        birthLocation TEXT,
        birthOrigin TEXT,
        studies TEXT,
        occupations TEXT,
        studiesAbroad TEXT
    )"""

    # genre - short story (nuvela), novel (roman), poem etc
    # movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
    # tags - other relevant information (i.e. psychological)
    __COMMAND_CREATE_FRAGMENTS = """CREATE TABLE IF NOT EXISTS Fragments (
        id INTEGER PRIMARY KEY,
        title TEXT,
        year INTEGER,
        author TEXT REFERENCES Authors(name),
        genre TEXT,
        movement TEXT,
        tags TEXT
    )"""

    # contains the actual text
    __COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE IF NOT EXISTS FragmentsContent (
        id INTEGER REFERENCES Fragments(id),
        content TEXT
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_AUTHORS)
        c.execute(self.__COMMAND_CREATE_FRAGMENTS)
        c.execute(self.__COMMAND_CREATE_FRAGMENTS_CONTENT)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS Authors')
        c.execute('DROP TABLE IF EXISTS Fragments')
        c.execute('DROP TABLE IF EXISTS FragmentsContent')
        self.commit()

    def getTextCount(self):
        c = self.connect()
        c.execute("SELECT COUNT(*) FROM Fragments")
        item = c.fetchone()
        self.commit()
        return item[0]

    def getAllTexts(self):
        c = self.connect()
        c.execute("SELECT id, content FROM FragmentsContent")
        items = c.fetchall()
        self.commit()
        return items
@@ -1,2 +0,0 @@
def analyzeWords(text):
    pass
16 src/textutils/__init__.py Normal file
@@ -0,0 +1,16 @@
def fixDiacritics(text):
    text = text.replace(u'ĭ', 'i')
    text = text.replace(u'ŭ', 'u')
    text = text.replace(u'à', 'a')
    return text

def isValidWord(word):
    # Alphanumeric => word
    if word.isalnum():
        return True

    # Some words might be contractions, which finish/begin with a '
    if word[1:].isalnum() or word[:-1].isalnum():
        return True

    return False
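A quick sanity check of the helpers (sketch):

import textutils

print(textutils.fixDiacritics("copilărĭa"))   # "copilăria" - old-orthography ĭ mapped to i
print(textutils.isValidWord("mere"))          # True  - plain alphanumeric token
print(textutils.isValidWord("'tis"))          # True  - contraction starting with an apostrophe
print(textutils.isValidWord("--"))            # False - punctuation token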
8 src/tools/test.php Normal file
@@ -0,0 +1,8 @@
<!DOCTYPE html>
<html>
<body>

<?php echo "Hello world!"?>

</body>
</html>