Refactored code; organized letter and word metrics

2017-06-05 20:30:13 +03:00
parent 806d9cdedc
commit 64caeab14e
24 changed files with 388 additions and 243 deletions

src/exec.py Normal file

@@ -0,0 +1,11 @@
print("Acquiring texts...")
import step0_acquire.wikisource_downloader
print("Processing letter frequencies... ")
import step1_text_processing.process_letter_frequencies
print("Processing word frequencies... ")
import step1_text_processing.process_word_frequencies
print("Processing word lengths... ")
import step1_text_processing.process_word_lengths
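
Note: each processing step defines a main() and invokes it at import time, so the imports above are what actually drive the pipeline, in order. A sketch of the equivalent explicit sequence, assuming the import-time main() calls were removed from the step modules:

# hypothetical explicit driver, not part of this commit
import step1_text_processing.process_letter_frequencies as letter_freqs
import step1_text_processing.process_word_frequencies as word_freqs
import step1_text_processing.process_word_lengths as word_lengths

letter_freqs.main()
word_freqs.main()
word_lengths.main()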


@@ -1,40 +0,0 @@
import logging
import time

# own
import logger
import storage.data
import storage.results
import textprocessor.letterfreq
import ttl.ttlparser
import ttl.ttlservice

def init():
    logger.init_logger(logging.WARNING)
    storage.data.initializeFragmentDatabase("data/texts.db")
    storage.results.initializeResultsDatabase("data/results.db", True)

def processTexts():
    count = storage.data.getTextCount()
    current = 0
    for item in storage.data.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1
        itemid = item[0]
        itemtext = item[1]

        # obtain ttl analysis
        # unfeasible - it takes 5-10 minutes for a single text
        # ttlResult = ttl.ttlservice.executeTtl(itemtext)
        # (words, chunks) = ttl.ttlparser.parseText(ttlResult)
        # storage.results.storeTtlAnalysis(itemid, words)

        # perform analysis
        letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext)
        storage.results.storeFrequencies(itemid, letterFreq)
    print("Finished!")

init()
processTexts()


@@ -10,10 +10,12 @@ def getAuthorList():
    authors = []
    for letter in LETTERS:
        print("Processing link page for letter", letter)

        # Read index page
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        data = urllib.request.urlopen(url).read()
        q = PyQuery(data)

        for item in q("div.mw-category-generated").find("a"):
            if (item.text.startswith("Autor:")):
                authorname = item.text[6:]
@@ -62,27 +64,6 @@ def getAuthorBasicInfo(authorname, authorlink):
    return info

# def getAuthorWikiInfo(authorinfo):
#     # Nothing can be learned without wiki page
#     if authorinfo["wiki"] is None:
#         return authorinfo
#     try:
#         data = urllib.request.urlopen(authorinfo["wiki"]).read()
#         q = PyQuery(data)
#         # Find the birth date
#         body = q("#mw-content-text").text()
#         result = re.compile(u"Născut\s+([\w\s]+)").match(body)
#         if not result is None:
#             authorinfo["birthyear"] = result.group(0)
#     except urllib.error.HTTPError:
#         pass
#     return authorinfo

def getText(url):
    data = urllib.request.urlopen(BASE_URL + url).read()
    q = PyQuery(data)


@@ -1,5 +1,14 @@
import operator

# own
from storage.texts import TextStorage
from storage.results.letterFrequencies import LetterFrequencyStorage

FREQUENCY_THRESHOLD = 0.005

def letterFrequencies(text):
    letterfreq = [{}, {}, {}, {}]
@@ -31,8 +40,31 @@ def letterFrequencies(text):
    # Almost done. Sort, normalize, and remove irrelevant items (with low frequency)
    for i in range(4):
        freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True)
        freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqSorted]
        freqFiltered = [(symbol, freq) for symbol, freq in freqNormalized if freq >= FREQUENCY_THRESHOLD]
        letterfreq[i] = freqFiltered
    return letterfreq

def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0
    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1
        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        letterFreq = letterFrequencies(itemtext)
        resultsStorage.store(itemid, letterFreq)
    print("Finished!")

def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = LetterFrequencyStorage("data/results.db")
    processTexts(textStorage, resultsStorage)

main()


@@ -0,0 +1,70 @@
import operator

import nltk.tokenize
import nltk.tokenize.moses
import nltk.stem.snowball

# own
from storage.texts import TextStorage
from storage.results.wordFrequencies import WordFrequencyStorage
import textutils

FREQUENCY_THRESHOLD = 0.001

def wordFrequencies(text):
    text = textutils.fixDiacritics(text)
    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')
    stemmer = nltk.stem.snowball.RomanianStemmer()

    words = tokenizer.tokenize(text)
    frequencies = {}
    for word in words:
        # Skip non-words
        if not textutils.isValidWord(word):
            continue

        # use word stem
        stem = stemmer.stem(word)
        if stem not in frequencies:
            frequencies[stem] = 1
        else:
            frequencies[stem] += 1

    # Normalize
    freqSorted = sorted(frequencies.items(), key=operator.itemgetter(1), reverse=True)
    freqNormalized = []
    for word, freq in freqSorted:
        freqNorm = float(freq) / len(words)
        if freqNorm >= FREQUENCY_THRESHOLD:
            freqNormalized.append((word, freqNorm))
    return freqNormalized

def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0
    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1
        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        wordFreq = wordFrequencies(itemtext)

        # store results
        resultsStorage.store(itemid, wordFreq)
    print("Finished!")

def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = WordFrequencyStorage("data/results.db")
    processTexts(textStorage, resultsStorage)

main()


@@ -0,0 +1,60 @@
import nltk.tokenize
import nltk.tokenize.moses

# own
from storage.texts import TextStorage
from storage.results.wordLengths import WordLengthStorage
import textutils

def wordLengths(text):
    text = textutils.fixDiacritics(text)
    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')

    words = tokenizer.tokenize(text)
    lengths = {}
    for word in words:
        # Skip non-words
        if not textutils.isValidWord(word):
            continue

        l = len(word)
        if l not in lengths:
            lengths[l] = 1
        else:
            lengths[l] += 1

    # normalize
    norm_lengths = [(length, float(freq) / len(words)) for length, freq in lengths.items()]
    return norm_lengths

def processTexts(textStorage, resultsStorage):
    count = textStorage.getTextCount()
    current = 0
    for item in textStorage.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1
        itemid = item[0]
        itemtext = item[1]

        # perform analysis
        wordLens = wordLengths(itemtext)

        # store results
        resultsStorage.store(itemid, wordLens)
    print("Finished!")

def main():
    textStorage = TextStorage("data/texts.db")
    resultsStorage = WordLengthStorage("data/results.db")
    processTexts(textStorage, resultsStorage)

main()


@@ -0,0 +1,33 @@
import sqlite3

class Storage:
    def __init__(self, dbFile):
        self.__dbFile = dbFile
        self.__con = None
        self.__cur = None
        self.__initialize()

    def __initialize(self):
        self._createDatabase()

    def _createDatabase(self):
        pass

    def _destroyDatabase(self):
        pass

    def connect(self):
        # Open a fresh connection and return a cursor for it
        self.__con = sqlite3.connect(self.__dbFile)
        self.__cur = self.__con.cursor()
        return self.__cur

    def commit(self, doClose=True):
        # Commit pending changes; by default also close the cursor and connection
        self.__con.commit()
        if doClose:
            self.__cur.close()
            self.__con.close()

    def recreateDatabase(self):
        self._destroyDatabase()
        self._createDatabase()
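
A short usage sketch of the intended lifecycle, using the TextStorage subclass added later in this commit (src/storage/texts.py): every operation opens a fresh connection via connect(), and commit() closes it again unless doClose=False.

# sketch; assumes the data/texts.db path used elsewhere in this commit
from storage.texts import TextStorage

texts = TextStorage("data/texts.db")  # __init__ triggers _createDatabase()
c = texts.connect()                   # new sqlite3 connection + cursor
c.execute("SELECT COUNT(*) FROM Fragments")
print(c.fetchone()[0])
texts.commit()                        # commit, then close cursor and connection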


@@ -1,80 +0,0 @@
import logging
import os
from model import *
import sqlite3

log = logging.getLogger("storage")

DB_FRAGMENTS = ""

# Commands
# birth location - general area, not exact location (i.e. Transylvania)
# birth origin - rural or urban
# studies - masters, bachelors, high school, middle school, primary school
# occupation - comma separated if there are multiple
# studiesAbroad - foreign cities where author studied (comma separated)
COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
    name TEXT PRIMARY KEY,
    birthYear INTEGER,
    birthLocation TEXT,
    birthOrigin TEXT,
    studies TEXT,
    occupations TEXT,
    studiesAbroad TEXT
)"""

# genre - short story (nuvela), novel (roman), poem etc
# movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
# tags - other relevant information (i.e. psychological)
COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
    id INTEGER PRIMARY KEY,
    title TEXT,
    year INTEGER,
    author TEXT REFERENCES Authors(name),
    genre TEXT,
    movement TEXT,
    tags TEXT
)"""

# contains the actual text
COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
    id INTEGER REFERENCES Fragments(id),
    content TEXT
)"""

# Initialize databases
def initializeFragmentDatabase(dbFile):
    global DB_FRAGMENTS
    DB_FRAGMENTS = dbFile
    if not os.path.exists(dbFile):
        log.info("Text database %s not found. Will create database.", dbFile)
        con = sqlite3.connect(dbFile)
        c = con.cursor()
        c.execute(COMMAND_CREATE_AUTHORS)
        c.execute(COMMAND_CREATE_FRAGMENTS)
        c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT)
        con.commit()
        con.close()
        log.info("Database created!")

def getTextCount():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT COUNT(*) FROM Fragments")
    item = c.fetchone()
    c.close()
    con.close()
    return item[0]

def getAllTexts():
    con = sqlite3.connect(DB_FRAGMENTS)
    c = con.cursor()
    c.execute("SELECT id, content FROM FragmentsContent")
    items = c.fetchall()
    c.close()
    con.close()
    return items


@@ -1,84 +0,0 @@
import logging
import os
from model.Word import *
import sqlite3

log = logging.getLogger("storage")

DB_RESULTS = ""

COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies (
    idtext INTEGER,
    lettergroup TEXT,
    category TEXT,
    frequency REAL
)"""

COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords (
    idtext INTEGER,
    wordIndex INTEGER,
    sentenceIndex INTEGER,
    word TEXT,
    lemma TEXT,
    analysis TEXT,
    chunk TEXT
)"""

# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
#     idtext INTEGER,
#     wordlength INTEGER,
#     frequency REAL
# )"""

def initializeResultsDatabase(dbFile, cleanupOldData):
    global DB_RESULTS
    DB_RESULTS = dbFile

    # cleanup old data
    if cleanupOldData:
        con = sqlite3.connect(DB_RESULTS)
        c = con.cursor()
        try:
            c.execute("DROP TABLE LetterFrequencies")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_LETTER_FREQUENCIES)
        try:
            c.execute("DROP TABLE TextWords")
        except sqlite3.OperationalError:
            pass
        c.execute(COMMAND_CREATE_TEXT_WORDS)
        con.commit()
        c.close()
        con.close()

def storeFrequencies(idtext, freq):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # add data
    chr = ['p', 'l1', 'l2', 'l3']
    for i in range(4):
        for let, fr in freq[i]:
            c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, chr[i], fr))
    con.commit()
    c.close()
    con.close()

def storeTtlAnalysis(idtext, words):
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()

    # store words
    for word in words:
        c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk))

    # finish
    con.commit()
    c.close()
    con.close()


@@ -0,0 +1,27 @@
import storage

class LetterFrequencyStorage(storage.Storage):
    __COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE IF NOT EXISTS LetterFrequencies (
        idtext INTEGER,
        lettergroup TEXT,
        category TEXT,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_LETTER_FREQUENCIES)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS LetterFrequencies')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()

        # category labels for the four frequency groups
        categories = ['p', 'l1', 'l2', 'l3']
        for i in range(4):
            for let, fr in frequencies[i]:
                c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, categories[i], fr))
        self.commit()
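
A minimal sketch of storing one text's letter frequencies with this class; the sample idtext and frequency values are illustrative only.

from storage.results.letterFrequencies import LetterFrequencyStorage

results = LetterFrequencyStorage("data/results.db")  # creates the table if missing
# four groups of (lettergroup, frequency) pairs, matching letterFrequencies() output
sample = [[('ab', 0.011)], [('a', 0.084)], [('b', 0.019)], [('c', 0.027)]]
results.store(1, sample)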


@@ -0,0 +1,24 @@
import storage

class WordFrequencyStorage(storage.Storage):
    __COMMAND_CREATE_WORD_FREQUENCIES = """CREATE TABLE IF NOT EXISTS WordFrequencies (
        idtext INTEGER,
        word TEXT,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_WORD_FREQUENCIES)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS WordFrequencies')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()
        for word, freq in frequencies:
            c.execute('INSERT INTO WordFrequencies VALUES(?, ?, ?)', (idtext, word, freq))
        self.commit()


@@ -0,0 +1,24 @@
import storage

class WordLengthStorage(storage.Storage):
    __COMMAND_CREATE_WORD_LENGTHS = """CREATE TABLE IF NOT EXISTS WordLengths (
        idtext INTEGER,
        wordlength INTEGER,
        frequency REAL
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_WORD_LENGTHS)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS WordLengths')
        self.commit()

    def store(self, idtext, frequencies):
        c = self.connect()
        for length, frequency in frequencies:
            c.execute("INSERT INTO WordLengths VALUES(?, ?, ?)", (idtext, length, frequency))
        self.commit()

src/storage/texts.py Normal file

@@ -0,0 +1,65 @@
import storage

class TextStorage(storage.Storage):
    # birth location - general area, not exact location (i.e. Transylvania)
    # birth origin - rural or urban
    # studies - masters, bachelors, high school, middle school, primary school
    # occupation - comma separated if there are multiple
    # studiesAbroad - foreign cities where author studied (comma separated)
    __COMMAND_CREATE_AUTHORS = """CREATE TABLE IF NOT EXISTS Authors (
        name TEXT PRIMARY KEY,
        birthYear INTEGER,
        birthLocation TEXT,
        birthOrigin TEXT,
        studies TEXT,
        occupations TEXT,
        studiesAbroad TEXT
    )"""

    # genre - short story (nuvela), novel (roman), poem etc
    # movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
    # tags - other relevant information (i.e. psychological)
    __COMMAND_CREATE_FRAGMENTS = """CREATE TABLE IF NOT EXISTS Fragments (
        id INTEGER PRIMARY KEY,
        title TEXT,
        year INTEGER,
        author TEXT REFERENCES Authors(name),
        genre TEXT,
        movement TEXT,
        tags TEXT
    )"""

    # contains the actual text
    __COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE IF NOT EXISTS FragmentsContent (
        id INTEGER REFERENCES Fragments(id),
        content TEXT
    )"""

    def _createDatabase(self):
        c = self.connect()
        c.execute(self.__COMMAND_CREATE_AUTHORS)
        c.execute(self.__COMMAND_CREATE_FRAGMENTS)
        c.execute(self.__COMMAND_CREATE_FRAGMENTS_CONTENT)
        self.commit()

    def _destroyDatabase(self):
        c = self.connect()
        c.execute('DROP TABLE IF EXISTS Authors')
        c.execute('DROP TABLE IF EXISTS Fragments')
        c.execute('DROP TABLE IF EXISTS FragmentsContent')
        self.commit()

    def getTextCount(self):
        c = self.connect()
        c.execute("SELECT COUNT(*) FROM Fragments")
        item = c.fetchone()
        self.commit()
        return item[0]

    def getAllTexts(self):
        c = self.connect()
        c.execute("SELECT id, content FROM FragmentsContent")
        items = c.fetchall()
        self.commit()
        return items


@@ -1,2 +0,0 @@
def analyzeWords(text):
    pass

src/textutils/__init__.py Normal file

@@ -0,0 +1,16 @@
def fixDiacritics(text):
    text = text.replace(u'ĭ', 'i')
    text = text.replace(u'ŭ', 'u')
    text = text.replace(u'à', 'a')
    return text

def isValidWord(word):
    # Alphanumeric => word
    if word.isalnum():
        return True

    # Some words might be contractions, which finish/begin with a '
    if word[1:].isalnum() or word[:-1].isalnum():
        return True

    return False
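
A quick illustration of the expected behaviour of these helpers; the sample tokens are assumptions based on the code above.

print(fixDiacritics(u'rĭul'))   # -> 'riul'
print(isValidWord('casă'))      # True  - alphanumeric
print(isValidWord("pân'"))      # True  - trailing-apostrophe contraction
print(isValidWord('--'))        # False - punctuation token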

src/tools/test.php Normal file

@@ -0,0 +1,8 @@
<!DOCTYPE html>
<html>
<body>
<?php echo "Hello world!"; ?>
</body>
</html>