Refactored code; organized letter and word metrics

2017-06-05 20:30:13 +03:00
parent 806d9cdedc
commit 64caeab14e
24 changed files with 388 additions and 243 deletions

src/exec.py Normal file

@@ -0,0 +1,11 @@
print("Acquiring texts...")
import step0_acquire.wikisource_downloader
print("Processing letter frequencies... ")
import step1_text_processing.process_letter_frequencies
print("Processing word frequencies... ")
import step1_text_processing.process_word_frequencies
print("Processing word lengths... ")
import step1_text_processing.process_word_lengths
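
Note: each processing step defines a main() and invokes it at import time, so the imports above are what actually drive the pipeline, in order. A sketch of the equivalent explicit sequence, assuming the import-time main() calls were removed from the step modules:

# hypothetical explicit driver, not part of this commit
import step1_text_processing.process_letter_frequencies as letter_freqs
import step1_text_processing.process_word_frequencies as word_freqs
import step1_text_processing.process_word_lengths as word_lengths

letter_freqs.main()
word_freqs.main()
word_lengths.main()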


@@ -1,40 +0,0 @@
import logging
import time

# own
import logger
import storage.data
import storage.results
import textprocessor.letterfreq
import ttl.ttlparser
import ttl.ttlservice

def init():
    logger.init_logger(logging.WARNING)
    storage.data.initializeFragmentDatabase("data/texts.db")
    storage.results.initializeResultsDatabase("data/results.db", True)

def processTexts():
    count = storage.data.getTextCount()
    current = 0
    for item in storage.data.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1
        itemid = item[0]
        itemtext = item[1]

        # obtain ttl analysis
        # unfeasible - it takes 5-10 minutes for a single text
        # ttlResult = ttl.ttlservice.executeTtl(itemtext)
        # (words, chunks) = ttl.ttlparser.parseText(ttlResult)
        # storage.results.storeTtlAnalysis(itemid, words)

        # perform analysis
        letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext)
        storage.results.storeFrequencies(itemid, letterFreq)
    print("Finished!")

init()
processTexts()


@@ -10,10 +10,12 @@ def getAuthorList():
    authors = []
    for letter in LETTERS:
        print("Processing link page for letter", letter)

        # Read index page
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        data = urllib.request.urlopen(url).read()
        q = PyQuery(data)

        for item in q("div.mw-category-generated").find("a"):
            if (item.text.startswith("Autor:")):
                authorname = item.text[6:]
@@ -62,27 +64,6 @@ def getAuthorBasicInfo(authorname, authorlink):
    return info

# def getAuthorWikiInfo(authorinfo):
#     # Nothing can be learned without wiki page
#     if authorinfo["wiki"] is None:
#         return authorinfo
#     try:
#         data = urllib.request.urlopen(authorinfo["wiki"]).read()
#         q = PyQuery(data)
#         # Find the birth date
#         body = q("#mw-content-text").text()
#         result = re.compile(u"Născut\s+([\w\s]+)").match(body)
#         if not result is None:
#             authorinfo["birthyear"] = result.group(0)
#     except urllib.error.HTTPError:
#         pass
#     return authorinfo

def getText(url):
    data = urllib.request.urlopen(BASE_URL + url).read()
    q = PyQuery(data)


@@ -1,5 +1,14 @@
import operator

# own
from storage.texts import TextStorage
from storage.results.letterFrequencies import LetterFrequencyStorage

FREQUENCY_THRESHOLD = 0.005

def letterFrequencies(text):
    letterfreq = [{}, {}, {}, {}]
@@ -31,8 +40,31 @@ def letterFrequencies(text):
    # Almost done. Sort, normalize, and remove irrelevant items (with low frequency)
    for i in range(4):
        freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True)
        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqSorted]
        freqFiltered = [(symbol, freq) for symbol, freq in freqNormalized if freq >= FREQUENCY_THRESHOLD]
        letterfreq[i] = freqFiltered
    return letterfreq

def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0
    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1
        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        letterFreq = letterFrequencies(itemtext)
        resultsStorage.store(itemid, letterFreq)
    print("Finished!")

def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = LetterFrequencyStorage("data/results.db")
    processTexts(textStorage, resultsStorage)

main()


@@ -0,0 +1,70 @@
import operator

import nltk.tokenize
import nltk.tokenize.moses
import nltk.stem.snowball

# own
from storage.texts import TextStorage
from storage.results.wordFrequencies import WordFrequencyStorage
import textutils

FREQUENCY_THRESHOLD = 0.001

def wordFrequencies(text):
    text = textutils.fixDiacritics(text)
    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')
    stemmer = nltk.stem.snowball.RomanianStemmer()

    words = tokenizer.tokenize(text)
    frequencies = {}
    for word in words:
        # Skip non-words
        if not textutils.isValidWord(word):
            continue

        # use word stem
        stem = stemmer.stem(word)
        if stem not in frequencies:
            frequencies[stem] = 1
        else:
            frequencies[stem] += 1

    # Normalize
    freqSorted = sorted(frequencies.items(), key=operator.itemgetter(1), reverse=True)
    freqNormalized = []
    for word, freq in freqSorted:
        freqNorm = float(freq) / len(words)
        if freqNorm >= FREQUENCY_THRESHOLD:
            freqNormalized.append((word, freqNorm))
    return freqNormalized

def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0
    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1
        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        wordFreq = wordFrequencies(itemtext)

        # store results
        resultsStorage.store(itemid, wordFreq)
    print("Finished!")

def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = WordFrequencyStorage("data/results.db")
    processTexts(textStorage, resultsStorage)

main()


@@ -0,0 +1,60 @@
import nltk.tokenize
import nltk.tokenize.moses

# own
from storage.texts import TextStorage
from storage.results.wordLengths import WordLengthStorage
import textutils

def wordLengths(text):
    text = textutils.fixDiacritics(text)
    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')

    words = tokenizer.tokenize(text)
    lengths = {}
    for word in words:
        # Skip non-words
        if not textutils.isValidWord(word):
            continue

        l = len(word)
        if l not in lengths:
            lengths[l] = 1
        else:
            lengths[l] += 1

    # normalize
    norm_lengths = [(length, float(freq) / len(words)) for length, freq in lengths.items()]
    return norm_lengths

def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0
    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1
        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        wordLens = wordLengths(itemtext)

        # store results
        resultsStorage.store(itemid, wordLens)
    print("Finished!")

def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = WordLengthStorage("data/results.db")
    processTexts(textStorage, resultsStorage)

main()


@@ -0,0 +1,33 @@
import sqlite3

class Storage:
    def __init__(self, dbFile):
        self.__dbFile = dbFile
        self.__con = None
        self.__cur = None
        self.__initialize()

    def __initialize(self):
        self._createDatabase()

    def _createDatabase(self):
        pass

    def _destroyDatabase(self):
        pass

    def connect(self):
        # Open a fresh connection and return a cursor for it
        self.__con = sqlite3.connect(self.__dbFile)
        self.__cur = self.__con.cursor()
        return self.__cur

    def commit(self, doClose=True):
        # Commit pending changes; by default also close the cursor and connection
        self.__con.commit()
        if doClose:
            self.__cur.close()
            self.__con.close()

    def recreateDatabase(self):
        self._destroyDatabase()
        self._createDatabase()
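
A short usage sketch of the intended lifecycle, using the TextStorage subclass added later in this commit (src/storage/texts.py): every operation opens a fresh connection via connect(), and commit() closes it again unless doClose=False.

# sketch; assumes the data/texts.db path used elsewhere in this commit
from storage.texts import TextStorage

texts = TextStorage("data/texts.db")  # __init__ triggers _createDatabase()
c = texts.connect()                   # new sqlite3 connection + cursor
c.execute("SELECT COUNT(*) FROM Fragments")
print(c.fetchone()[0])
texts.commit()                        # commit, then close cursor and connection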


@@ -1,80 +0,0 @@
import logging
import os
from model import *
import sqlite3

log = logging.getLogger("storage")

DB_FRAGMENTS = ""

# Commands
# birth location - general area, not exact location (i.e. Transylvania)
# birth origin - rural or urban
# studies - masters, bachelors, high school, middle school, primary school
# occupation - comma separated if there are multiple
# studiesAbroad - foreign cities where author studied (comma separated)
COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
    name TEXT PRIMARY KEY,
    birthYear INTEGER,
    birthLocation TEXT,
    birthOrigin TEXT,
    studies TEXT,
    occupations TEXT,
    studiesAbroad TEXT
)"""

# genre - short story (nuvela), novel (roman), poem etc
# movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
# tags - other relevant information (i.e. psychological)
COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
    id INTEGER PRIMARY KEY,
    title TEXT,
    year INTEGER,
    author TEXT REFERENCES Authors(name),
    genre TEXT,
    movement TEXT,
    tags TEXT
)"""

# contains the actual text
COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
    id INTEGER REFERENCES Fragments(id),
    content TEXT
)"""

# Initialize databases
def initializeFragmentDatabase(dbFile):
    global DB_FRAGMENTS
    DB_FRAGMENTS = dbFile
    if not os.path.exists(dbFile):
        log.info("Text database %s not found. Will create database.", dbFile)
        con = sqlite3.connect(dbFile)
        c = con.cursor()
        c.execute(COMMAND_CREATE_AUTHORS)
        c.execute(COMMAND_CREATE_FRAGMENTS)
        c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT)
        con.commit()
        con.close()
        log.info("Database created!")

def getTextCount():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT COUNT(*) FROM Fragments")
    item = c.fetchone()
    c.close()
    con.close()
    return item[0]

def getAllTexts():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT id, content FROM FragmentsContent")
    items = c.fetchall()
    c.close()
    con.close()
    return items


@@ -1,84 +0,0 @@
import logging
import os
from model.Word import *
import sqlite3

log = logging.getLogger("storage")

DB_RESULTS = ""

COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies (
    idtext INTEGER,
    lettergroup TEXT,
    category TEXT,
    frequency REAL
)"""

COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords (
    idtext INTEGER,
    wordIndex INTEGER,
    sentenceIndex INTEGER,
    word TEXT,
    lemma TEXT,
    analysis TEXT,
    chunk TEXT
)"""

# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
#     idtext INTEGER,
#     wordlength INTEGER,
#     frequency REAL
# )"""

def initializeResultsDatabase(dbFile, cleanupOldData):
    global DB_RESULTS
    DB_RESULTS = dbFile

    # cleanup old data
    if cleanupOldData:
        con = sqlite3.connect(DB_RESULTS)
        c = con.cursor()
        try:
            c.execute("DROP TABLE LetterFrequencies")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_LETTER_FREQUENCIES)
        try:
            c.execute("DROP TABLE TextWords")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_TEXT_WORDS)
        con.commit()
        c.close()
        con.close()

def storeFrequencies(idtext, freq):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # add data
    chr = ['p', 'l1', 'l2', 'l3']
    for i in range(4):
        for let, fr in freq[i]:
            c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, chr[i], fr))
    con.commit()
    c.close()
    con.close()

def storeTtlAnalysis(idtext, words):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # store words
    for word in words:
        c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk))

    # finish
    con.commit()
    c.close()
    con.close()


@@ -0,0 +1,27 @@
import storage

class LetterFrequencyStorage(storage.Storage):
    __COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE IF NOT EXISTS LetterFrequencies (
        idtext INTEGER,
        lettergroup TEXT,
        category TEXT,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_LETTER_FREQUENCIES)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS LetterFrequencies')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()

        # category labels for the four frequency groups
        categories = ['p', 'l1', 'l2', 'l3']
        for i in range(4):
            for let, fr in frequencies[i]:
                c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, categories[i], fr))
        self.commit()
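
A minimal sketch of storing one text's letter frequencies with this class; the sample idtext and frequency values are illustrative only.

from storage.results.letterFrequencies import LetterFrequencyStorage

results = LetterFrequencyStorage("data/results.db")  # creates the table if missing
# four groups of (lettergroup, frequency) pairs, matching letterFrequencies() output
sample = [[('ab', 0.011)], [('a', 0.084)], [('b', 0.019)], [('c', 0.027)]]
results.store(1, sample)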


@@ -0,0 +1,24 @@
import storage

class WordFrequencyStorage(storage.Storage):
    __COMMAND_CREATE_WORD_FREQUENCIES = """CREATE TABLE IF NOT EXISTS WordFrequencies (
        idtext INTEGER,
        word TEXT,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_WORD_FREQUENCIES)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS WordFrequencies')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()
        for word, freq in frequencies:
            c.execute('INSERT INTO WordFrequencies VALUES(?, ?, ?)', (idtext, word, freq))
        self.commit()


@@ -0,0 +1,24 @@
import storage

class WordLengthStorage(storage.Storage):
    __COMMAND_CREATE_WORD_LENGTHS = """CREATE TABLE IF NOT EXISTS WordLengths (
        idtext INTEGER,
        wordlength INTEGER,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_WORD_LENGTHS)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS WordLengths')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()
        for length, frequency in frequencies:
            c.execute("INSERT INTO WordLengths VALUES(?, ?, ?)", (idtext, length, frequency))
        self.commit()

src/storage/texts.py Normal file

@@ -0,0 +1,65 @@
import storage

class TextStorage(storage.Storage):
    # birth location - general area, not exact location (i.e. Transylvania)
    # birth origin - rural or urban
    # studies - masters, bachelors, high school, middle school, primary school
    # occupation - comma separated if there are multiple
    # studiesAbroad - foreign cities where author studied (comma separated)
    __COMMAND_CREATE_AUTHORS = """CREATE TABLE IF NOT EXISTS Authors (
        name TEXT PRIMARY KEY,
        birthYear INTEGER,
        birthLocation TEXT,
        birthOrigin TEXT,
        studies TEXT,
        occupations TEXT,
        studiesAbroad TEXT
    )"""

    # genre - short story (nuvela), novel (roman), poem etc
    # movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
    # tags - other relevant information (i.e. psychological)
    __COMMAND_CREATE_FRAGMENTS = """CREATE TABLE IF NOT EXISTS Fragments (
        id INTEGER PRIMARY KEY,
        title TEXT,
        year INTEGER,
        author TEXT REFERENCES Authors(name),
        genre TEXT,
        movement TEXT,
        tags TEXT
    )"""

    # contains the actual text
    __COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE IF NOT EXISTS FragmentsContent (
        id INTEGER REFERENCES Fragments(id),
        content TEXT
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_AUTHORS)
        c.execute(self.__COMMAND_CREATE_FRAGMENTS)
        c.execute(self.__COMMAND_CREATE_FRAGMENTS_CONTENT)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS Authors')
        c.execute('DROP TABLE IF EXISTS Fragments')
        c.execute('DROP TABLE IF EXISTS FragmentsContent')
        self.commit()

    def getTextCount(self):
        c = self.connect()
        c.execute("SELECT COUNT(*) FROM Fragments")
        item = c.fetchone()
        self.commit()
        return item[0]

    def getAllTexts(self):
        c = self.connect()
        c.execute("SELECT id, content FROM FragmentsContent")
        items = c.fetchall()
        self.commit()
        return items


@@ -1,2 +0,0 @@
def analyzeWords(text):
    pass

src/textutils/__init__.py Normal file

@@ -0,0 +1,16 @@
def fixDiacritics(text):
    text = text.replace(u'ĭ', 'i')
    text = text.replace(u'ŭ', 'u')
    text = text.replace(u'à', 'a')
    return text

def isValidWord(word):
    # Alphanumeric => word
    if word.isalnum():
        return True

    # Some words might be contractions, which finish/begin with a '
    if word[1:].isalnum() or word[:-1].isalnum():
        return True

    return False
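
A quick illustration of the expected behaviour of these helpers; the sample tokens are assumptions based on the code above.

print(fixDiacritics(u'rĭul'))   # -> 'riul'
print(isValidWord('casă'))      # True  - alphanumeric
print(isValidWord("pân'"))      # True  - trailing-apostrophe contraction
print(isValidWord('--'))        # False - punctuation token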

src/tools/test.php Normal file

@@ -0,0 +1,8 @@
<!DOCTYPE html>
<html>
<body>
<?php echo "Hello world!"; ?>
</body>
</html>