Refactored code; organized letter and word metrics

Tiberiu Chibici 2017-06-05 20:30:13 +03:00
parent 806d9cdedc
commit 64caeab14e
24 changed files with 388 additions and 243 deletions

Binary file not shown.


@@ -1,17 +1,17 @@
# Fdb version 3
["bibtex AuthorshipDetection"] 1495653012 "AuthorshipDetection.aux" "AuthorshipDetection.bbl" "AuthorshipDetection" 1495653014
"AuthorshipDetection.aux" 1495653014 1103 ccc8f941d3406278b8df384b7972f88a ""
["bibtex AuthorshipDetection"] 1496507489 "AuthorshipDetection.aux" "AuthorshipDetection.bbl" "AuthorshipDetection" 1496507491
"AuthorshipDetection.aux" 1496507490 1103 ccc8f941d3406278b8df384b7972f88a ""
"C:/Program Files/MiKTeX 2.9/bibtex/bst/bibtex/plain.bst" 1291868336 20613 bd3fbfa9f64872b81ac57a0dd2ed855f ""
"bibliography/bibliography.bib" 1495652619 244 59c2a4b6607d9d24b3cfb7d66faed6a6 ""
(generated)
"AuthorshipDetection.bbl"
"AuthorshipDetection.blg"
["pdflatex"] 1495653013 "d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" "AuthorshipDetection.pdf" "AuthorshipDetection" 1495653014
"AuthorshipDetection.aux" 1495653014 1103 ccc8f941d3406278b8df384b7972f88a ""
"AuthorshipDetection.bbl" 1495653013 207 ce85d772213e84afc1a264255c65ba4e "bibtex AuthorshipDetection"
"AuthorshipDetection.tdo" 1495653014 149 5ef486a5a64ba9d20e77900a17e38ab8 ""
["pdflatex"] 1496507489 "d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" "AuthorshipDetection.pdf" "AuthorshipDetection" 1496507491
"AuthorshipDetection.aux" 1496507490 1103 ccc8f941d3406278b8df384b7972f88a ""
"AuthorshipDetection.bbl" 1496507489 207 ce85d772213e84afc1a264255c65ba4e "bibtex AuthorshipDetection"
"AuthorshipDetection.tdo" 1496507490 149 5ef486a5a64ba9d20e77900a17e38ab8 ""
"AuthorshipDetection.tex" 1495653011 1176 1eb61d9b5d736225f78429976651335c ""
"AuthorshipDetection.toc" 1495653014 354 93710fd8d0aa2019c18a188df168978a ""
"AuthorshipDetection.toc" 1496507490 354 93710fd8d0aa2019c18a188df168978a ""
"C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmbx10.tfm" 1136768653 1328 c834bbb027764024c09d3d2bf908b5f0 ""
"C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmbx12.tfm" 1136768653 1324 c910af8c371558dc20f2d7822f66fe64 ""
"C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmmi12.tfm" 1136768653 1524 4414a8315f39513458b80dfc63bff03a ""
@@ -126,15 +126,15 @@
"C:/Program Files/MiKTeX 2.9/tex/latex/tools/calc.sty" 1492423194 10503 d03d065f799d54f6b7e9b175f8d84279 ""
"C:/Program Files/MiKTeX 2.9/tex/latex/xcolor/xcolor.sty" 1463135581 57049 34128738f682d033422ca125f82e5d62 ""
"C:/Program Files/MiKTeX 2.9/tex/latex/xkeyval/xkeyval.sty" 1419274338 5114 9c1069474ff71dbc47d5006555e352d3 ""
"C:/ProgramData/MiKTeX/2.9/pdftex/config/pdftex.map" 1495650797 80423 3cfba73105275d3ad410701d89fbdf7a ""
"C:/ProgramData/MiKTeX/2.9/pdftex/config/pdftex.map" 1495981403 80423 3cfba73105275d3ad410701d89fbdf7a ""
"C:/Users/Tibi/AppData/Local/MiKTeX/2.9/miktex/data/le/pdftex/pdflatex.fmt" 1495550378 4001358 752f924a412e4944af4e01c5a9beae77 ""
"chapters/abstract.tex" 1495553267 495 58ad3ecfec349d84d898cef65bea34a8 ""
"chapters/introduction.tex" 1495651567 1996 d9cf7ce732c566423ec6b6f8f56fcd7e ""
"chapters/previouswork.tex" 1495652541 29 112f1954c35a5d96c415f13d34bcd056 ""
"d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" 1495653011 1176 1eb61d9b5d736225f78429976651335c ""
(generated)
"AuthorshipDetection.pdf"
"AuthorshipDetection.log"
"AuthorshipDetection.tdo"
"AuthorshipDetection.aux"
"AuthorshipDetection.toc"
"AuthorshipDetection.log"
"AuthorshipDetection.pdf"
"AuthorshipDetection.tdo"

Binary file not shown.

Binary file not shown.

11 src/exec.py Normal file

@@ -0,0 +1,11 @@
print("Acquiring texts...")
import step0_acquire.wikisource_downloader
print("Processing letter frequencies... ")
import step1_text_processing.process_letter_frequencies
print("Processing word frequencies... ")
import step1_text_processing.process_word_frequencies
print("Processing word lengths... ")
import step1_text_processing.process_word_lengths
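
Note: this driver works through import side effects, since each step module runs its main() at module level (see the main() calls at the bottom of the step1 files below). A minimal sketch (not part of this commit) of the more conventional layout, assuming the step modules instead guarded their entry points with an __name__ check:

import step0_acquire.wikisource_downloader as step0
import step1_text_processing.process_letter_frequencies as step1a

print("Acquiring texts...")
step0.main()    # hypothetical main(); the current modules run their work on import
print("Processing letter frequencies...")
step1a.main()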


@@ -1,40 +0,0 @@
import logging
import time

# own
import logger
import storage.data
import storage.results
import textprocessor.letterfreq
import ttl.ttlparser
import ttl.ttlservice


def init():
    logger.init_logger(logging.WARNING)
    storage.data.initializeFragmentDatabase("data/texts.db")
    storage.results.initializeResultsDatabase("data/results.db", True)


def processTexts():
    count = storage.data.getTextCount()
    current = 0

    for item in storage.data.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # obtain ttl analysis
        # unfeasible - it takes 5-10 minutes for a single text
        # ttlResult = ttl.ttlservice.executeTtl(itemtext)
        # (words, chunks) = ttl.ttlparser.parseText(ttlResult)
        # storage.results.storeTtlAnalysis(itemid, words)

        # perform analysis
        letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext)
        storage.results.storeFrequencies(itemid, letterFreq)

    print("Finished!")


init()
processTexts()


@@ -10,10 +10,12 @@ def getAuthorList():
    authors = []

    for letter in LETTERS:
        print("Processing link page for letter", letter)

        # Read index page
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        data = urllib.request.urlopen(url).read()
        q = PyQuery(data)

        for item in q("div.mw-category-generated").find("a"):
            if item.text.startswith("Autor:"):
                authorname = item.text[6:]
@@ -62,27 +64,6 @@ def getAuthorBasicInfo(authorname, authorlink):
    return info


# def getAuthorWikiInfo(authorinfo):
#     # Nothing can be learned without wiki page
#     if authorinfo["wiki"] is None:
#         return authorinfo
#
#     try:
#         data = urllib.request.urlopen(authorinfo["wiki"]).read()
#         q = PyQuery(data)
#
#         # Find the birth date
#         body = q("#mw-content-text").text()
#         result = re.compile(u"Născut\s+([\w\s]+)").match(body)
#         if not result is None:
#             authorinfo["birthyear"] = result.group(0)
#     except urllib.error.HTTPError:
#         pass
#
#     return authorinfo


def getText(url):
    data = urllib.request.urlopen(BASE_URL + url).read()
    q = PyQuery(data)


@@ -1,5 +1,14 @@
import logging
import time
import operator
import storage

# own
from storage.data import TextStorage
from storage.results.letterFrequencies import LetterFrequencyStorage
import ttl.ttlparser
import ttl.ttlservice

FREQUENCY_TRESHOLD = 0.005


def letterFrequencies(text):
    letterfreq = [{}, {}, {}, {}]
@@ -31,8 +40,31 @@ def letterFrequencies(text):
    # Almost done. Sort, normalize, and remove irrelevant items (with low frequency)
    for i in range(4):
        freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True)
        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqSorted]
        freqFiltered = [(symbol, freq) for symbol, freq in freqNormalized if freq >= FREQUENCY_TRESHOLD]
        letterfreq[i] = freqFiltered

    return letterfreq
def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0

    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        letterFreq = letterFrequencies(itemtext)
        resultsStorage.store(itemid, letterFreq)

    print("Finished!")


def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = LetterFrequencyStorage("data/results.db")
    processTexts(textStorage, resultsStorage)


main()
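
A hypothetical worked example of the normalize-then-filter step introduced above, with invented counts for a single letter group:

import operator

FREQUENCY_TRESHOLD = 0.005
counts = {'a': 600, 'b': 300, 'z': 1}       # invented raw counts
total = float(sum(counts.values()))         # 901

freqSorted = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
freqNormalized = [(s, f / total) for s, f in freqSorted]
freqFiltered = [(s, f) for s, f in freqNormalized if f >= FREQUENCY_TRESHOLD]
print(freqFiltered)   # [('a', 0.6659...), ('b', 0.3330...)]; 'z' (~0.0011) is dropped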


@@ -0,0 +1,70 @@
import logging
import time
import operator
import nltk.tokenize
import nltk.tokenize.moses
import nltk.stem.snowball

# own
from storage.data import TextStorage
from storage.results.wordFrequencies import WordFrequencyStorage
import textutils

FREQUENCY_TRESHOLD = 0.001


def wordFrequencies(text):
    text = textutils.fixDiacritics(text)

    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')
    stemmer = nltk.stem.snowball.RomanianStemmer()

    words = tokenizer.tokenize(text)
    frequencies = {}

    for word in words:
        # Skip non-words
        if not textutils.isValidWord(word):
            continue

        # use word stem
        stem = stemmer.stem(word)
        if stem not in frequencies:
            frequencies[stem] = 1
        else:
            frequencies[stem] += 1

    # Normalize
    freqSorted = sorted(frequencies.items(), key=operator.itemgetter(1), reverse=True)
    freqNormalized = []
    for word, freq in freqSorted:
        freqNorm = float(freq) / len(words)
        if freqNorm >= FREQUENCY_TRESHOLD:
            freqNormalized.append((word, freqNorm))

    return freqNormalized
def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0

    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        wordFreq = wordFrequencies(itemtext)

        # store results
        resultsStorage.store(itemid, wordFreq)

    print("Finished!")


def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = WordFrequencyStorage("data/results.db")
    processTexts(textStorage, resultsStorage)


main()
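
A small sketch (not part of the commit) of the stem-and-count idea above. It uses an invented, pre-tokenized input so it does not depend on MosesTokenizer, which later NLTK releases moved out to the separate sacremoses package:

import nltk.stem.snowball

stemmer = nltk.stem.snowball.RomanianStemmer()
words = "casa casele caselor".split()       # hypothetical, already tokenized
frequencies = {}
for word in words:
    stem = stemmer.stem(word)               # inflected forms tend to share a stem
    frequencies[stem] = frequencies.get(stem, 0) + 1
print(frequencies)                           # counts per stem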


@@ -0,0 +1,60 @@
import logging
import time
import operator
import nltk.tokenize
import nltk.tokenize.moses
import nltk.stem.snowball

# own
from storage.data import TextStorage
from storage.results.wordLengths import WordLengthStorage
import textutils


def wordLengths(text):
    text = textutils.fixDiacritics(text)

    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')
    words = tokenizer.tokenize(text)

    lengths = {}
    for word in words:
        # Skip non-words
        if not textutils.isValidWord(word):
            continue

        l = len(word)
        if l not in lengths:
            lengths[l] = 1
        else:
            lengths[l] += 1

    # normalize
    norm_lengths = [(length, float(freq) / len(words)) for length, freq in lengths.items()]
    return norm_lengths
def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0

    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1

        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        wordLens = wordLengths(itemtext)

        # store results
        resultsStorage.store(itemid, wordLens)

    print("Finished!")


def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = WordLengthStorage("data/results.db")
    processTexts(textStorage, resultsStorage)


main()
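
A hypothetical worked example of the length-histogram normalization above:

words = ["ana", "are", "mere"]               # invented filtered tokens
lengths = {}
for w in words:
    lengths[len(w)] = lengths.get(len(w), 0) + 1
norm_lengths = [(l, float(f) / len(words)) for l, f in lengths.items()]
print(norm_lengths)                           # [(3, 0.666...), (4, 0.333...)]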


@@ -0,0 +1,33 @@
import os.path
import sqlite3


class Storage:
    def __init__(self, dbFile):
        self.__dbFile = dbFile
        self.__con = None
        self.__cur = None
        self.__initialize()

    def __initialize(self):
        self._createDatabase()

    def _createDatabase(self):
        pass

    def _destroyDatabase(self):
        pass

    def connect(self):
        self.__con = sqlite3.connect(self.__dbFile)
        self.__cur = self.__con.cursor()
        return self.__cur

    def commit(self, doClose=True):
        self.__con.commit()
        if doClose:
            self.__cur.close()
            self.__con.close()

    def recreateDatabase(self):
        self._destroyDatabase()
        self._createDatabase()
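
A minimal sketch (not part of this commit) of how a subclass is expected to plug into Storage: override _createDatabase/_destroyDatabase and wrap statements in connect()/commit(). The table and file names here are hypothetical:

class ExampleStorage(Storage):
    def _createDatabase(self):
        c = self.connect()
        c.execute("CREATE TABLE IF NOT EXISTS Example (id INTEGER, value TEXT)")
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute("DROP TABLE IF EXISTS Example")
        self.commit()

store = ExampleStorage("data/example.db")    # _createDatabase() runs in __init__
store.recreateDatabase()                     # drop and recreate from scratch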


@@ -1,80 +0,0 @@
import logging
import os
from model import *
import sqlite3

log = logging.getLogger("storage")

DB_FRAGMENTS = ""

# Commands
# birth location - general area, not exact location (i.e. Transylvania)
# birth origin - rural or urban
# studies - masters, bachelors, high school, middle school, primary school
# occupation - comma separated if there are multiple
# studiesAbroad - foreign cities where author studied (comma separated)
COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
    name TEXT PRIMARY KEY,
    birthYear INTEGER,
    birthLocation TEXT,
    birthOrigin TEXT,
    studies TEXT,
    occupations TEXT,
    studiesAbroad TEXT
)"""

# genre - short story (nuvela), novel (roman), poem etc
# movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
# tags - other relevant information (i.e. psychological)
COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
    id INTEGER PRIMARY KEY,
    title TEXT,
    year INTEGER,
    author TEXT REFERENCES Authors(name),
    genre TEXT,
    movement TEXT,
    tags TEXT
)"""

# contains the actual text
COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
    id INTEGER REFERENCES Fragments(id),
    content TEXT
)"""


# Initialize databases
def initializeFragmentDatabase(dbFile):
    global DB_FRAGMENTS
    DB_FRAGMENTS = dbFile

    if not os.path.exists(dbFile):
        log.info("Text database %s not found. Will create database.", dbFile)

        con = sqlite3.connect(dbFile)
        c = con.cursor()
        c.execute(COMMAND_CREATE_AUTHORS)
        c.execute(COMMAND_CREATE_FRAGMENTS)
        c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT)
        con.commit()
        con.close()

        log.info("Database created!")


def getTextCount():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT COUNT(*) FROM Fragments")
    item = c.fetchone()
    c.close()
    con.close()
    return item[0]


def getAllTexts():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT id, content FROM FragmentsContent")
    items = c.fetchall()
    c.close()
    con.close()
    return items


@@ -1,84 +0,0 @@
import logging
import os
from model.Word import *
import sqlite3

log = logging.getLogger("storage")

DB_RESULTS = ""

COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies (
    idtext INTEGER,
    lettergroup TEXT,
    category TEXT,
    frequency REAL
)"""

COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords (
    idtext INTEGER,
    wordIndex INTEGER,
    sentenceIndex INTEGER,
    word TEXT,
    lemma TEXT,
    analysis TEXT,
    chunk TEXT
)"""

# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
#     idtext INTEGER,
#     wordlength INTEGER,
#     frequency REAL
# )"""


def initializeResultsDatabase(dbFile, cleanupOldData):
    global DB_RESULTS
    DB_RESULTS = dbFile

    # cleanup old data
    if cleanupOldData:
        con = sqlite3.connect(DB_RESULTS)
        c = con.cursor()

        try:
            c.execute("DROP TABLE LetterFrequencies")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_LETTER_FREQUENCIES)

        try:
            c.execute("DROP TABLE TextWords")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_TEXT_WORDS)

        con.commit()
        c.close()
        con.close()


def storeFrequencies(idtext, freq):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # add data
    categories = ['p', 'l1', 'l2', 'l3']
    for i in range(4):
        for let, fr in freq[i]:
            c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, categories[i], fr))

    con.commit()
    c.close()
    con.close()


def storeTtlAnalysis(idtext, words):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # store words
    for word in words:
        c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk))

    # finish
    con.commit()
    c.close()
    con.close()


@@ -0,0 +1,27 @@
import storage


class LetterFrequencyStorage(storage.Storage):
    __COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE IF NOT EXISTS LetterFrequencies (
        idtext INTEGER,
        lettergroup TEXT,
        category TEXT,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_LETTER_FREQUENCIES)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS LetterFrequencies')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()

        categories = ['p', 'l1', 'l2', 'l3']
        for i in range(4):
            for let, fr in frequencies[i]:
                c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, categories[i], fr))

        self.commit()


@@ -0,0 +1,24 @@
import storage


class WordFrequencyStorage(storage.Storage):
    __COMMAND_CREATE_WORD_FREQUENCIES = """CREATE TABLE IF NOT EXISTS WordFrequencies (
        idtext INTEGER,
        word TEXT,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_WORD_FREQUENCIES)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS WordFrequencies')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()
        for word, freq in frequencies:
            c.execute('INSERT INTO WordFrequencies VALUES(?, ?, ?)', (idtext, word, freq))
        self.commit()


@@ -0,0 +1,24 @@
import storage


class WordLengthStorage(storage.Storage):
    __COMMAND_CREATE_WORD_LENGTHS = """CREATE TABLE IF NOT EXISTS WordLengths (
        idtext INTEGER,
        wordlength INTEGER,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_WORD_LENGTHS)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS WordLengths')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()
        for length, frequency in frequencies:
            c.execute("INSERT INTO WordLengths VALUES(?, ?, ?)", (idtext, length, frequency))
        self.commit()

65 src/storage/texts.py Normal file

@@ -0,0 +1,65 @@
import storage


class TextStorage(storage.Storage):
    # birth location - general area, not exact location (i.e. Transylvania)
    # birth origin - rural or urban
    # studies - masters, bachelors, high school, middle school, primary school
    # occupation - comma separated if there are multiple
    # studiesAbroad - foreign cities where author studied (comma separated)
    __COMMAND_CREATE_AUTHORS = """CREATE TABLE IF NOT EXISTS Authors (
        name TEXT PRIMARY KEY,
        birthYear INTEGER,
        birthLocation TEXT,
        birthOrigin TEXT,
        studies TEXT,
        occupations TEXT,
        studiesAbroad TEXT
    )"""

    # genre - short story (nuvela), novel (roman), poem etc
    # movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
    # tags - other relevant information (i.e. psychological)
    __COMMAND_CREATE_FRAGMENTS = """CREATE TABLE IF NOT EXISTS Fragments (
        id INTEGER PRIMARY KEY,
        title TEXT,
        year INTEGER,
        author TEXT REFERENCES Authors(name),
        genre TEXT,
        movement TEXT,
        tags TEXT
    )"""

    # contains the actual text
    __COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE IF NOT EXISTS FragmentsContent (
        id INTEGER REFERENCES Fragments(id),
        content TEXT
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_AUTHORS)
        c.execute(self.__COMMAND_CREATE_FRAGMENTS)
        c.execute(self.__COMMAND_CREATE_FRAGMENTS_CONTENT)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS Authors')
        c.execute('DROP TABLE IF EXISTS Fragments')
        c.execute('DROP TABLE IF EXISTS FragmentsContent')
        self.commit()

    def getTextCount(self):
        c = self.connect()
        c.execute("SELECT COUNT(*) FROM Fragments")
        item = c.fetchone()
        self.commit()
        return item[0]

    def getAllTexts(self):
        c = self.connect()
        c.execute("SELECT id, content FROM FragmentsContent")
        items = c.fetchall()
        self.commit()
        return items
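
A short usage sketch (not part of this commit), mirroring how the step1 scripts consume this class:

texts = TextStorage("data/texts.db")         # creates the tables if missing
print(texts.getTextCount())

for itemid, content in texts.getAllTexts():
    pass                                      # each row is (fragment id, full text)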


@@ -1,2 +0,0 @@
def analyzeWords(text):
    pass

16 src/textutils/__init__.py Normal file

@@ -0,0 +1,16 @@
def fixDiacritics(text):
    text = text.replace(u'ĭ', 'i')
    text = text.replace(u'ŭ', 'u')
    text = text.replace(u'à', 'a')
    return text


def isValidWord(word):
    # Alphanumeric => word
    if word.isalnum():
        return True

    # Some words might be contractions, which finish/begin with a '
    if word[1:].isalnum() or word[:-1].isalnum():
        return True

    return False
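
A quick check of the two helpers, with hypothetical inputs:

print(fixDiacritics(u"copilărĭe"))   # -> 'copilărie' (old orthography normalized)
print(isValidWord(u"'nalt"))         # True: leading-apostrophe contraction
print(isValidWord("mere"))           # True: plain alphanumeric word
print(isValidWord("..."))            # False: punctuation only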

8 src/tools/test.php Normal file

@@ -0,0 +1,8 @@
<!DOCTYPE html>
<html>
<body>
<?php echo "Hello world!"?>
</body>
</html>