Refactored code; organized letter and word metrics

Tiberiu Chibici 2017-06-05 20:30:13 +03:00
parent 806d9cdedc
commit 64caeab14e
24 changed files with 388 additions and 243 deletions

Binary file not shown.


@@ -1,17 +1,17 @@
# Fdb version 3
["bibtex AuthorshipDetection"] 1495653012 "AuthorshipDetection.aux" "AuthorshipDetection.bbl" "AuthorshipDetection" 1495653014
"AuthorshipDetection.aux" 1495653014 1103 ccc8f941d3406278b8df384b7972f88a ""
["bibtex AuthorshipDetection"] 1496507489 "AuthorshipDetection.aux" "AuthorshipDetection.bbl" "AuthorshipDetection" 1496507491
"AuthorshipDetection.aux" 1496507490 1103 ccc8f941d3406278b8df384b7972f88a ""
"C:/Program Files/MiKTeX 2.9/bibtex/bst/bibtex/plain.bst" 1291868336 20613 bd3fbfa9f64872b81ac57a0dd2ed855f ""
"bibliography/bibliography.bib" 1495652619 244 59c2a4b6607d9d24b3cfb7d66faed6a6 ""
(generated)
"AuthorshipDetection.bbl"
"AuthorshipDetection.blg"
["pdflatex"] 1495653013 "d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" "AuthorshipDetection.pdf" "AuthorshipDetection" 1495653014
"AuthorshipDetection.aux" 1495653014 1103 ccc8f941d3406278b8df384b7972f88a ""
"AuthorshipDetection.bbl" 1495653013 207 ce85d772213e84afc1a264255c65ba4e "bibtex AuthorshipDetection"
"AuthorshipDetection.tdo" 1495653014 149 5ef486a5a64ba9d20e77900a17e38ab8 ""
["pdflatex"] 1496507489 "d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" "AuthorshipDetection.pdf" "AuthorshipDetection" 1496507491
"AuthorshipDetection.aux" 1496507490 1103 ccc8f941d3406278b8df384b7972f88a ""
"AuthorshipDetection.bbl" 1496507489 207 ce85d772213e84afc1a264255c65ba4e "bibtex AuthorshipDetection"
"AuthorshipDetection.tdo" 1496507490 149 5ef486a5a64ba9d20e77900a17e38ab8 ""
"AuthorshipDetection.tex" 1495653011 1176 1eb61d9b5d736225f78429976651335c ""
"AuthorshipDetection.toc" 1495653014 354 93710fd8d0aa2019c18a188df168978a ""
"AuthorshipDetection.toc" 1496507490 354 93710fd8d0aa2019c18a188df168978a ""
"C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmbx10.tfm" 1136768653 1328 c834bbb027764024c09d3d2bf908b5f0 ""
"C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmbx12.tfm" 1136768653 1324 c910af8c371558dc20f2d7822f66fe64 ""
"C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmmi12.tfm" 1136768653 1524 4414a8315f39513458b80dfc63bff03a ""
@@ -126,15 +126,15 @@
"C:/Program Files/MiKTeX 2.9/tex/latex/tools/calc.sty" 1492423194 10503 d03d065f799d54f6b7e9b175f8d84279 ""
"C:/Program Files/MiKTeX 2.9/tex/latex/xcolor/xcolor.sty" 1463135581 57049 34128738f682d033422ca125f82e5d62 ""
"C:/Program Files/MiKTeX 2.9/tex/latex/xkeyval/xkeyval.sty" 1419274338 5114 9c1069474ff71dbc47d5006555e352d3 ""
"C:/ProgramData/MiKTeX/2.9/pdftex/config/pdftex.map" 1495650797 80423 3cfba73105275d3ad410701d89fbdf7a ""
"C:/ProgramData/MiKTeX/2.9/pdftex/config/pdftex.map" 1495981403 80423 3cfba73105275d3ad410701d89fbdf7a ""
"C:/Users/Tibi/AppData/Local/MiKTeX/2.9/miktex/data/le/pdftex/pdflatex.fmt" 1495550378 4001358 752f924a412e4944af4e01c5a9beae77 ""
"chapters/abstract.tex" 1495553267 495 58ad3ecfec349d84d898cef65bea34a8 ""
"chapters/introduction.tex" 1495651567 1996 d9cf7ce732c566423ec6b6f8f56fcd7e ""
"chapters/previouswork.tex" 1495652541 29 112f1954c35a5d96c415f13d34bcd056 ""
"d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" 1495653011 1176 1eb61d9b5d736225f78429976651335c ""
(generated)
"AuthorshipDetection.pdf"
"AuthorshipDetection.log"
"AuthorshipDetection.tdo"
"AuthorshipDetection.aux"
"AuthorshipDetection.toc"
"AuthorshipDetection.log"
"AuthorshipDetection.pdf"
"AuthorshipDetection.tdo"

Binary file not shown.

Binary file not shown.

11 src/exec.py Normal file

@@ -0,0 +1,11 @@
print("Acquiring texts...")
import step0_acquire.wikisource_downloader
print("Processing letter frequencies... ")
import step1_text_processing.process_letter_frequencies
print("Processing word frequencies... ")
import step1_text_processing.process_word_frequencies
print("Processing word lengths... ")
import step1_text_processing.process_word_lengths
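
Note: this driver works through import side effects, since each step module runs its main() at module level (see the main() calls at the bottom of the step1 files below). A minimal sketch (not part of this commit) of the more conventional layout, assuming the step modules instead guarded their entry points with an __name__ check:

import step0_acquire.wikisource_downloader as step0
import step1_text_processing.process_letter_frequencies as step1a

print("Acquiring texts...")
step0.main()    # hypothetical main(); the current modules run their work on import
print("Processing letter frequencies...")
step1a.main()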


@@ -1,40 +0,0 @@
import logging
import time

# own
import logger
import storage.data
import storage.results
import textprocessor.letterfreq
import ttl.ttlparser
import ttl.ttlservice


def init():
    logger.init_logger(logging.WARNING)
    storage.data.initializeFragmentDatabase("data/texts.db")
    storage.results.initializeResultsDatabase("data/results.db", True)


def processTexts():
    count = storage.data.getTextCount()
    current = 0

    for item in storage.data.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # obtain ttl analysis
        # unfeasible - it takes 5-10 minutes for a single text
        # ttlResult = ttl.ttlservice.executeTtl(itemtext)
        # (words, chunks) = ttl.ttlparser.parseText(ttlResult)
        # storage.results.storeTtlAnalysis(itemid, words)

        # perform analysis
        letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext)
        storage.results.storeFrequencies(itemid, letterFreq)

    print("Finished!")


init()
processTexts()


@@ -10,10 +10,12 @@ def getAuthorList():
    authors = []

    for letter in LETTERS:
        print("Processing link page for letter", letter)

        # Read index page
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        data = urllib.request.urlopen(url).read()
        q = PyQuery(data)

        for item in q("div.mw-category-generated").find("a"):
            if item.text.startswith("Autor:"):
                authorname = item.text[6:]
@@ -62,27 +64,6 @@ def getAuthorBasicInfo(authorname, authorlink):
    return info


# def getAuthorWikiInfo(authorinfo):
#     # Nothing can be learned without wiki page
#     if authorinfo["wiki"] is None:
#         return authorinfo
#
#     try:
#         data = urllib.request.urlopen(authorinfo["wiki"]).read()
#         q = PyQuery(data)
#
#         # Find the birth date
#         body = q("#mw-content-text").text()
#         result = re.compile(u"Născut\s+([\w\s]+)").match(body)
#         if not result is None:
#             authorinfo["birthyear"] = result.group(0)
#     except urllib.error.HTTPError:
#         pass
#
#     return authorinfo


def getText(url):
    data = urllib.request.urlopen(BASE_URL + url).read()
    q = PyQuery(data)


@@ -1,5 +1,14 @@
import logging
import time
import operator
import storage

# own
from storage.data import TextStorage
from storage.results.letterFrequencies import LetterFrequencyStorage
import ttl.ttlparser
import ttl.ttlservice

FREQUENCY_TRESHOLD = 0.005


def letterFrequencies(text):
    letterfreq = [{}, {}, {}, {}]
@@ -31,8 +40,31 @@ def letterFrequencies(text):
    # Almost done. Sort, normalize, and remove irrelevant items (with low frequency)
    for i in range(4):
        freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True)
        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqSorted]
        freqFiltered = [(symbol, freq) for symbol, freq in freqNormalized if freq >= FREQUENCY_TRESHOLD]
        letterfreq[i] = freqFiltered

    return letterfreq
def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0

    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        letterFreq = letterFrequencies(itemtext)
        resultsStorage.store(itemid, letterFreq)

    print("Finished!")


def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = LetterFrequencyStorage("data/results.db")
    processTexts(textStorage, resultsStorage)


main()
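
A hypothetical worked example of the normalize-then-filter step introduced above, with invented counts for a single letter group:

import operator

FREQUENCY_TRESHOLD = 0.005
counts = {'a': 600, 'b': 300, 'z': 1}       # invented raw counts
total = float(sum(counts.values()))         # 901

freqSorted = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
freqNormalized = [(s, f / total) for s, f in freqSorted]
freqFiltered = [(s, f) for s, f in freqNormalized if f >= FREQUENCY_TRESHOLD]
print(freqFiltered)   # [('a', 0.6659...), ('b', 0.3330...)]; 'z' (~0.0011) is dropped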


@@ -0,0 +1,70 @@
import logging
import time
import operator
import nltk.tokenize
import nltk.tokenize.moses
import nltk.stem.snowball

# own
from storage.data import TextStorage
from storage.results.wordFrequencies import WordFrequencyStorage
import textutils

FREQUENCY_TRESHOLD = 0.001


def wordFrequencies(text):
    text = textutils.fixDiacritics(text)

    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')
    stemmer = nltk.stem.snowball.RomanianStemmer()

    words = tokenizer.tokenize(text)
    frequencies = {}

    for word in words:
        # Skip non-words
        if not textutils.isValidWord(word):
            continue

        # use word stem
        stem = stemmer.stem(word)
        if stem not in frequencies:
            frequencies[stem] = 1
        else:
            frequencies[stem] += 1

    # Normalize
    freqSorted = sorted(frequencies.items(), key=operator.itemgetter(1), reverse=True)
    freqNormalized = []
    for word, freq in freqSorted:
        freqNorm = float(freq) / len(words)
        if freqNorm >= FREQUENCY_TRESHOLD:
            freqNormalized.append((word, freqNorm))

    return freqNormalized
def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0

    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        wordFreq = wordFrequencies(itemtext)

        # store results
        resultsStorage.store(itemid, wordFreq)

    print("Finished!")


def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = WordFrequencyStorage("data/results.db")
    processTexts(textStorage, resultsStorage)


main()
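
A small sketch (not part of the commit) of the stem-and-count idea above. It uses an invented, pre-tokenized input so it does not depend on MosesTokenizer, which later NLTK releases moved out to the separate sacremoses package:

import nltk.stem.snowball

stemmer = nltk.stem.snowball.RomanianStemmer()
words = "casa casele caselor".split()       # hypothetical, already tokenized
frequencies = {}
for word in words:
    stem = stemmer.stem(word)               # inflected forms tend to share a stem
    frequencies[stem] = frequencies.get(stem, 0) + 1
print(frequencies)                           # counts per stem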


@@ -0,0 +1,60 @@
import logging
import time
import operator
import nltk.tokenize
import nltk.tokenize.moses
import nltk.stem.snowball

# own
from storage.data import TextStorage
from storage.results.wordLengths import WordLengthStorage
import textutils


def wordLengths(text):
    text = textutils.fixDiacritics(text)

    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')
    words = tokenizer.tokenize(text)

    lengths = {}
    for word in words:
        # Skip non-words
        if not textutils.isValidWord(word):
            continue

        l = len(word)
        if l not in lengths:
            lengths[l] = 1
        else:
            lengths[l] += 1

    # normalize
    norm_lengths = [(length, float(freq) / len(words)) for length, freq in lengths.items()]
    return norm_lengths
def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0

    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        wordLens = wordLengths(itemtext)

        # store results
        resultsStorage.store(itemid, wordLens)

    print("Finished!")


def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = WordLengthStorage("data/results.db")
    processTexts(textStorage, resultsStorage)


main()
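
A hypothetical worked example of the length-histogram normalization above:

words = ["ana", "are", "mere"]               # invented filtered tokens
lengths = {}
for w in words:
    lengths[len(w)] = lengths.get(len(w), 0) + 1
norm_lengths = [(l, float(f) / len(words)) for l, f in lengths.items()]
print(norm_lengths)                           # [(3, 0.666...), (4, 0.333...)]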


@@ -0,0 +1,33 @@
import os.path
import sqlite3


class Storage:
    def __init__(self, dbFile):
        self.__dbFile = dbFile
        self.__con = None
        self.__cur = None
        self.__initialize()

    def __initialize(self):
        self._createDatabase()

    def _createDatabase(self):
        pass

    def _destroyDatabase(self):
        pass

    def connect(self):
        self.__con = sqlite3.connect(self.__dbFile)
        self.__cur = self.__con.cursor()
        return self.__cur

    def commit(self, doClose=True):
        self.__con.commit()
        if doClose:
            self.__cur.close()
            self.__con.close()

    def recreateDatabase(self):
        self._destroyDatabase()
        self._createDatabase()
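
A minimal sketch (not part of this commit) of how a subclass is expected to plug into Storage: override _createDatabase/_destroyDatabase and wrap statements in connect()/commit(). The table and file names here are hypothetical:

class ExampleStorage(Storage):
    def _createDatabase(self):
        c = self.connect()
        c.execute("CREATE TABLE IF NOT EXISTS Example (id INTEGER, value TEXT)")
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute("DROP TABLE IF EXISTS Example")
        self.commit()

store = ExampleStorage("data/example.db")    # _createDatabase() runs in __init__
store.recreateDatabase()                     # drop and recreate from scratch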


@@ -1,80 +0,0 @@
import logging
import os
from model import *
import sqlite3

log = logging.getLogger("storage")

DB_FRAGMENTS = ""

# Commands
# birth location - general area, not exact location (i.e. Transylvania)
# birth origin - rural or urban
# studies - masters, bachelors, high school, middle school, primary school
# occupation - comma separated if there are multiple
# studiesAbroad - foreign cities where author studied (comma separated)
COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
    name TEXT PRIMARY KEY,
    birthYear INTEGER,
    birthLocation TEXT,
    birthOrigin TEXT,
    studies TEXT,
    occupations TEXT,
    studiesAbroad TEXT
)"""

# genre - short story (nuvela), novel (roman), poem etc
# movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
# tags - other relevant information (i.e. psychological)
COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
    id INTEGER PRIMARY KEY,
    title TEXT,
    year INTEGER,
    author TEXT REFERENCES Authors(name),
    genre TEXT,
    movement TEXT,
    tags TEXT
)"""

# contains the actual text
COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
    id INTEGER REFERENCES Fragments(id),
    content TEXT
)"""


# Initialize databases
def initializeFragmentDatabase(dbFile):
    global DB_FRAGMENTS
    DB_FRAGMENTS = dbFile

    if not os.path.exists(dbFile):
        log.info("Text database %s not found. Will create database.", dbFile)

        con = sqlite3.connect(dbFile)
        c = con.cursor()
        c.execute(COMMAND_CREATE_AUTHORS)
        c.execute(COMMAND_CREATE_FRAGMENTS)
        c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT)
        con.commit()
        con.close()

        log.info("Database created!")


def getTextCount():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT COUNT(*) FROM Fragments")
    item = c.fetchone()
    c.close()
    con.close()
    return item[0]


def getAllTexts():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT id, content FROM FragmentsContent")
    items = c.fetchall()
    c.close()
    con.close()
    return items


@@ -1,84 +0,0 @@
import logging
import os
from model.Word import *
import sqlite3

log = logging.getLogger("storage")

DB_RESULTS = ""

COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies (
    idtext INTEGER,
    lettergroup TEXT,
    category TEXT,
    frequency REAL
)"""

COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords (
    idtext INTEGER,
    wordIndex INTEGER,
    sentenceIndex INTEGER,
    word TEXT,
    lemma TEXT,
    analysis TEXT,
    chunk TEXT
)"""

# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
#     idtext INTEGER,
#     wordlength INTEGER,
#     frequency REAL
# )"""


def initializeResultsDatabase(dbFile, cleanupOldData):
    global DB_RESULTS
    DB_RESULTS = dbFile

    # cleanup old data
    if cleanupOldData:
        con = sqlite3.connect(DB_RESULTS)
        c = con.cursor()

        try:
            c.execute("DROP TABLE LetterFrequencies")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_LETTER_FREQUENCIES)

        try:
            c.execute("DROP TABLE TextWords")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_TEXT_WORDS)

        con.commit()
        c.close()
        con.close()


def storeFrequencies(idtext, freq):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # add data
    categories = ['p', 'l1', 'l2', 'l3']
    for i in range(4):
        for let, fr in freq[i]:
            c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, categories[i], fr))

    con.commit()
    c.close()
    con.close()


def storeTtlAnalysis(idtext, words):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # store words
    for word in words:
        c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk))

    # finish
    con.commit()
    c.close()
    con.close()


@@ -0,0 +1,27 @@
import storage


class LetterFrequencyStorage(storage.Storage):
    __COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE IF NOT EXISTS LetterFrequencies (
        idtext INTEGER,
        lettergroup TEXT,
        category TEXT,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_LETTER_FREQUENCIES)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS LetterFrequencies')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()

        categories = ['p', 'l1', 'l2', 'l3']
        for i in range(4):
            for let, fr in frequencies[i]:
                c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, categories[i], fr))

        self.commit()


@@ -0,0 +1,24 @@
import storage


class WordFrequencyStorage(storage.Storage):
    __COMMAND_CREATE_WORD_FREQUENCIES = """CREATE TABLE IF NOT EXISTS WordFrequencies (
        idtext INTEGER,
        word TEXT,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_WORD_FREQUENCIES)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS WordFrequencies')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()
        for word, freq in frequencies:
            c.execute('INSERT INTO WordFrequencies VALUES(?, ?, ?)', (idtext, word, freq))
        self.commit()


@@ -0,0 +1,24 @@
import storage


class WordLengthStorage(storage.Storage):
    __COMMAND_CREATE_WORD_LENGTHS = """CREATE TABLE IF NOT EXISTS WordLengths (
        idtext INTEGER,
        wordlength INTEGER,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_WORD_LENGTHS)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS WordLengths')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()
        for length, frequency in frequencies:
            c.execute("INSERT INTO WordLengths VALUES(?, ?, ?)", (idtext, length, frequency))
        self.commit()

65 src/storage/texts.py Normal file

@@ -0,0 +1,65 @@
import storage


class TextStorage(storage.Storage):
    # birth location - general area, not exact location (i.e. Transylvania)
    # birth origin - rural or urban
    # studies - masters, bachelors, high school, middle school, primary school
    # occupation - comma separated if there are multiple
    # studiesAbroad - foreign cities where author studied (comma separated)
    __COMMAND_CREATE_AUTHORS = """CREATE TABLE IF NOT EXISTS Authors (
        name TEXT PRIMARY KEY,
        birthYear INTEGER,
        birthLocation TEXT,
        birthOrigin TEXT,
        studies TEXT,
        occupations TEXT,
        studiesAbroad TEXT
    )"""

    # genre - short story (nuvela), novel (roman), poem etc
    # movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
    # tags - other relevant information (i.e. psychological)
    __COMMAND_CREATE_FRAGMENTS = """CREATE TABLE IF NOT EXISTS Fragments (
        id INTEGER PRIMARY KEY,
        title TEXT,
        year INTEGER,
        author TEXT REFERENCES Authors(name),
        genre TEXT,
        movement TEXT,
        tags TEXT
    )"""

    # contains the actual text
    __COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE IF NOT EXISTS FragmentsContent (
        id INTEGER REFERENCES Fragments(id),
        content TEXT
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_AUTHORS)
        c.execute(self.__COMMAND_CREATE_FRAGMENTS)
        c.execute(self.__COMMAND_CREATE_FRAGMENTS_CONTENT)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS Authors')
        c.execute('DROP TABLE IF EXISTS Fragments')
        c.execute('DROP TABLE IF EXISTS FragmentsContent')
        self.commit()

    def getTextCount(self):
        c = self.connect()
        c.execute("SELECT COUNT(*) FROM Fragments")
        item = c.fetchone()
        self.commit()
        return item[0]

    def getAllTexts(self):
        c = self.connect()
        c.execute("SELECT id, content FROM FragmentsContent")
        items = c.fetchall()
        self.commit()
        return items
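
A short usage sketch (not part of this commit), mirroring how the step1 scripts consume this class:

texts = TextStorage("data/texts.db")         # creates the tables if missing
print(texts.getTextCount())

for itemid, content in texts.getAllTexts():
    pass                                      # each row is (fragment id, full text)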


@@ -1,2 +0,0 @@
def analyzeWords(text):
    pass

16 src/textutils/__init__.py Normal file

@@ -0,0 +1,16 @@
def fixDiacritics(text):
    text = text.replace(u'ĭ', 'i')
    text = text.replace(u'ŭ', 'u')
    text = text.replace(u'à', 'a')
    return text


def isValidWord(word):
    # Alphanumeric => word
    if word.isalnum():
        return True

    # Some words might be contractions, which finish/begin with a '
    if word[1:].isalnum() or word[:-1].isalnum():
        return True

    return False
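
A quick check of the two helpers, with hypothetical inputs:

print(fixDiacritics(u"copilărĭe"))   # -> 'copilărie' (old orthography normalized)
print(isValidWord(u"'nalt"))         # True: leading-apostrophe contraction
print(isValidWord("mere"))           # True: plain alphanumeric word
print(isValidWord("..."))            # False: punctuation only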

8 src/tools/test.php Normal file

@@ -0,0 +1,8 @@
<!DOCTYPE html>
<html>
<body>
<?php echo "Hello world!"?>
</body>
</html>