diff --git a/data/results.db b/data/results.db
index f0bae2b..7fde558 100644
Binary files a/data/results.db and b/data/results.db differ
diff --git a/papers/AuthorshipDetection.fdb_latexmk b/papers/AuthorshipDetection.fdb_latexmk
index 5096a36..c9c6118 100644
--- a/papers/AuthorshipDetection.fdb_latexmk
+++ b/papers/AuthorshipDetection.fdb_latexmk
@@ -1,17 +1,17 @@
 # Fdb version 3
-["bibtex AuthorshipDetection"] 1495653012 "AuthorshipDetection.aux" "AuthorshipDetection.bbl" "AuthorshipDetection" 1495653014
-  "AuthorshipDetection.aux" 1495653014 1103 ccc8f941d3406278b8df384b7972f88a ""
+["bibtex AuthorshipDetection"] 1496507489 "AuthorshipDetection.aux" "AuthorshipDetection.bbl" "AuthorshipDetection" 1496507491
+  "AuthorshipDetection.aux" 1496507490 1103 ccc8f941d3406278b8df384b7972f88a ""
   "C:/Program Files/MiKTeX 2.9/bibtex/bst/bibtex/plain.bst" 1291868336 20613 bd3fbfa9f64872b81ac57a0dd2ed855f ""
   "bibliography/bibliography.bib" 1495652619 244 59c2a4b6607d9d24b3cfb7d66faed6a6 ""
   (generated)
   "AuthorshipDetection.bbl"
   "AuthorshipDetection.blg"
-["pdflatex"] 1495653013 "d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" "AuthorshipDetection.pdf" "AuthorshipDetection" 1495653014
-  "AuthorshipDetection.aux" 1495653014 1103 ccc8f941d3406278b8df384b7972f88a ""
-  "AuthorshipDetection.bbl" 1495653013 207 ce85d772213e84afc1a264255c65ba4e "bibtex AuthorshipDetection"
-  "AuthorshipDetection.tdo" 1495653014 149 5ef486a5a64ba9d20e77900a17e38ab8 ""
+["pdflatex"] 1496507489 "d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" "AuthorshipDetection.pdf" "AuthorshipDetection" 1496507491
+  "AuthorshipDetection.aux" 1496507490 1103 ccc8f941d3406278b8df384b7972f88a ""
+  "AuthorshipDetection.bbl" 1496507489 207 ce85d772213e84afc1a264255c65ba4e "bibtex AuthorshipDetection"
+  "AuthorshipDetection.tdo" 1496507490 149 5ef486a5a64ba9d20e77900a17e38ab8 ""
   "AuthorshipDetection.tex" 1495653011 1176 1eb61d9b5d736225f78429976651335c ""
-  "AuthorshipDetection.toc" 1495653014 354 93710fd8d0aa2019c18a188df168978a ""
+  "AuthorshipDetection.toc" 1496507490 354 93710fd8d0aa2019c18a188df168978a ""
   "C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmbx10.tfm" 1136768653 1328 c834bbb027764024c09d3d2bf908b5f0 ""
   "C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmbx12.tfm" 1136768653 1324 c910af8c371558dc20f2d7822f66fe64 ""
   "C:/Program Files/MiKTeX 2.9/fonts/tfm/public/cm/cmmi12.tfm" 1136768653 1524 4414a8315f39513458b80dfc63bff03a ""
@@ -126,15 +126,15 @@
   "C:/Program Files/MiKTeX 2.9/tex/latex/tools/calc.sty" 1492423194 10503 d03d065f799d54f6b7e9b175f8d84279 ""
   "C:/Program Files/MiKTeX 2.9/tex/latex/xcolor/xcolor.sty" 1463135581 57049 34128738f682d033422ca125f82e5d62 ""
   "C:/Program Files/MiKTeX 2.9/tex/latex/xkeyval/xkeyval.sty" 1419274338 5114 9c1069474ff71dbc47d5006555e352d3 ""
-  "C:/ProgramData/MiKTeX/2.9/pdftex/config/pdftex.map" 1495650797 80423 3cfba73105275d3ad410701d89fbdf7a ""
+  "C:/ProgramData/MiKTeX/2.9/pdftex/config/pdftex.map" 1495981403 80423 3cfba73105275d3ad410701d89fbdf7a ""
   "C:/Users/Tibi/AppData/Local/MiKTeX/2.9/miktex/data/le/pdftex/pdflatex.fmt" 1495550378 4001358 752f924a412e4944af4e01c5a9beae77 ""
   "chapters/abstract.tex" 1495553267 495 58ad3ecfec349d84d898cef65bea34a8 ""
   "chapters/introduction.tex" 1495651567 1996 d9cf7ce732c566423ec6b6f8f56fcd7e ""
   "chapters/previouswork.tex" 1495652541 29 112f1954c35a5d96c415f13d34bcd056 ""
   "d:\Facultate\Dizertatie\papers\AuthorshipDetection.tex" 1495653011 1176 1eb61d9b5d736225f78429976651335c ""
   (generated)
+  "AuthorshipDetection.pdf"
+  "AuthorshipDetection.log"
"AuthorshipDetection.tdo" "AuthorshipDetection.aux" "AuthorshipDetection.toc" - "AuthorshipDetection.log" - "AuthorshipDetection.pdf" - "AuthorshipDetection.tdo" diff --git a/papers/AuthorshipDetection.pdf b/papers/AuthorshipDetection.pdf index 861b407..e7f677a 100644 Binary files a/papers/AuthorshipDetection.pdf and b/papers/AuthorshipDetection.pdf differ diff --git a/papers/AuthorshipDetection.synctex.gz b/papers/AuthorshipDetection.synctex.gz new file mode 100644 index 0000000..b817768 Binary files /dev/null and b/papers/AuthorshipDetection.synctex.gz differ diff --git a/papers/AuthorshipDetection.synctex.gz(busy) b/papers/AuthorshipDetection.synctex.gz(busy) deleted file mode 100644 index b3ab407..0000000 Binary files a/papers/AuthorshipDetection.synctex.gz(busy) and /dev/null differ diff --git a/src/exec.py b/src/exec.py new file mode 100644 index 0000000..9e448ea --- /dev/null +++ b/src/exec.py @@ -0,0 +1,11 @@ +print("Acquiring texts...") +import step0_acquire.wikisource_downloader + +print("Processing letter frequencies... ") +import step1_text_processing.process_letter_frequencies + +print("Processing word frequencies... ") +import step1_text_processing.process_word_frequencies + +print("Processing word lengths... ") +import step1_text_processing.process_word_lengths \ No newline at end of file diff --git a/src/main.py b/src/main.py deleted file mode 100644 index 6634a66..0000000 --- a/src/main.py +++ /dev/null @@ -1,40 +0,0 @@ -import logging -import time -# own -import logger -import storage.data -import storage.results -import textprocessor.letterfreq -import ttl.ttlparser -import ttl.ttlservice - -def init(): - logger.init_logger(logging.WARNING) - storage.data.initializeFragmentDatabase("data/texts.db") - storage.results.initializeResultsDatabase("data/results.db", True) - -def processTexts(): - count = storage.data.getTextCount() - current = 0 - for item in storage.data.getAllTexts(): - print("Processing item", current, "out of", count) - current = current + 1 - - itemid = item[0] - itemtext = item[1] - - # obtain ttl analysis - # unfeasable - it takes 5-10 minutes for a single text - # ttlResult = ttl.ttlservice.executeTtl(itemtext) - # (words, chunks) = ttl.ttlparser.parseText(ttlResult) - # storage.results.storeTtlAnalysis(itemid, words) - - # perform analysis - letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext) - storage.results.storeFrequencies(itemid, letterFreq) - - print("Finished!") - -init() -processTexts() - diff --git a/src/textprocessor/__init__.py b/src/step0_acquire/__init__.py similarity index 100% rename from src/textprocessor/__init__.py rename to src/step0_acquire/__init__.py diff --git a/src/tools/wikisource_downloader.py b/src/step0_acquire/wikisource_downloader.py similarity index 84% rename from src/tools/wikisource_downloader.py rename to src/step0_acquire/wikisource_downloader.py index 2502ab7..213b6cf 100644 --- a/src/tools/wikisource_downloader.py +++ b/src/step0_acquire/wikisource_downloader.py @@ -10,10 +10,12 @@ def getAuthorList(): authors = [] for letter in LETTERS: print("Processing link page for letter", letter) + # Read index page url = BASE_URL + '/wiki/Categorie:Autori-' + letter data = urllib.request.urlopen(url).read() q = PyQuery(data) + for item in q("div.mw-category-generated").find("a"): if (item.text.startswith("Autor:")): authorname = item.text[6:] @@ -62,27 +64,6 @@ def getAuthorBasicInfo(authorname, authorlink): return info -# def getAuthorWikiInfo(authorinfo): - -# # Nothing can be learned without wiki page -# if 
authorinfo["wiki"] is None: -# return authorinfo - -# try: -# data = urllib.request.urlopen(authorinfo["wiki"]).read() -# q = PyQuery(data) - -# # Find the birth date -# body = q("#mw-content-text").text() -# result = re.compile(u"Născut\s+([\w\s]+)").match(body) -# if not result is None: -# authorinfo["birthyear"] = result.group(0) - -# except urllib.error.HTTPError: -# pass - -# return authorinfo - def getText(url): data = urllib.request.urlopen(BASE_URL + url).read() q = PyQuery(data) diff --git a/src/step1_text_processing/__init__.py b/src/step1_text_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/textprocessor/letterfreq.py b/src/step1_text_processing/process_letter_frequencies.py similarity index 53% rename from src/textprocessor/letterfreq.py rename to src/step1_text_processing/process_letter_frequencies.py index 28f5fb2..afbc71e 100644 --- a/src/textprocessor/letterfreq.py +++ b/src/step1_text_processing/process_letter_frequencies.py @@ -1,5 +1,14 @@ +import logging +import time import operator -import storage +# own +from storage.data import TextStorage +from storage.results.letterFrequencies import LetterFrequencyStorage + +import ttl.ttlparser +import ttl.ttlservice + +FREQUENCY_TRESHOLD = 0.005 def letterFrequencies(text): letterfreq = [{}, {}, {}, {}] @@ -31,8 +40,31 @@ def letterFrequencies(text): # Almost done. Sort and remove irrelevant items (with low frequency), and normalize data for i in range(4): freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True) - freqFiltered = freqSorted[0:50] - freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqFiltered] + freqNormalized = [(symbol, freq / lettersum[i]) for symbol, freq in freqSorted] + freqFiltered = [(symbol, freq) for symbol, freq in freqNormalized if freq >= FREQUENCY_TRESHOLD] letterfreq[i] = freqNormalized - - return letterfreq \ No newline at end of file + + return letterfreq + +def processTexts(TextStorage, resultsStorage): + count = TextStorage.getTextCount() + current = 0 + for item in TextStorage.getAllTexts(): + print("Processing item", current, "out of", count) + current = current + 1 + + itemid = item[0] + itemtext = item[1] + + # perform analysis + letterFreq = letterFrequencies(itemtext) + resultsStorage.store(itemid, letterFreq) + + print("Finished!") + +def main(): + TextStorage = TextStorage("data/texts.db") + resultsStorage = LetterFrequencyStorage("data/results.db") + processTexts(TextStorage, resultsStorage) + +main() \ No newline at end of file diff --git a/src/step1_text_processing/process_word_frequencies.py b/src/step1_text_processing/process_word_frequencies.py new file mode 100644 index 0000000..51e4f5f --- /dev/null +++ b/src/step1_text_processing/process_word_frequencies.py @@ -0,0 +1,70 @@ +import logging +import time +import operator +import nltk.tokenize +import nltk.tokenize.moses +import nltk.stem.snowball + +# own +from storage.data import TextStorage +from storage.results.wordFrequencies import WordFrequencyStorage +import textutils + +FREQUENCY_TRESHOLD = 0.001 + +def wordFrequencies(text): + text = textutils.fixDiacritics(text) + tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro') + stemmer = nltk.stem.snowball.RomanianStemmer() + + words = tokenizer.tokenize(text) + frequencies = {} + + for word in words: + # Skip non-words + if not textutils.isValidWord(word): + continue + + # use word stem + stem = stemmer.stem(word) + + if stem not in frequencies: + frequencies[stem] = 1 + else: + 
+            frequencies[stem] += 1
+
+    # Normalize
+    freqSorted = sorted(frequencies.items(), key=operator.itemgetter(1), reverse=True)
+
+    freqNormalized = []
+    for word, freq in freqSorted:
+        freqNorm = float(freq) / len(words)
+        if freqNorm >= FREQUENCY_THRESHOLD:
+            freqNormalized.append((word, freqNorm))
+
+    return freqNormalized
+
+def processTexts(textStorage, resultsStorage):
+    count = textStorage.getTextCount()
+    current = 0
+    for item in textStorage.getAllTexts():
+        print("Processing item", current, "out of", count)
+        current = current + 1
+
+        itemid = item[0]
+        itemtext = item[1]
+
+        # perform analysis
+        wordFreq = wordFrequencies(itemtext)
+
+        # store results
+        resultsStorage.store(itemid, wordFreq)
+
+    print("Finished!")
+
+def main():
+    textStorage = TextStorage("data/texts.db")
+    resultsStorage = WordFrequencyStorage("data/results.db")
+    processTexts(textStorage, resultsStorage)
+
+main()
\ No newline at end of file
diff --git a/src/step1_text_processing/process_word_lengths.py b/src/step1_text_processing/process_word_lengths.py
new file mode 100644
index 0000000..b621df2
--- /dev/null
+++ b/src/step1_text_processing/process_word_lengths.py
@@ -0,0 +1,60 @@
+import logging
+import time
+import operator
+import nltk.tokenize
+import nltk.tokenize.moses
+import nltk.stem.snowball
+
+# own
+from storage.texts import TextStorage
+from storage.results.wordLengths import WordLengthStorage
+import textutils
+
+def wordLengths(text):
+    text = textutils.fixDiacritics(text)
+    tokenizer = nltk.tokenize.moses.MosesTokenizer(lang='ro')
+    words = tokenizer.tokenize(text)
+
+    lengths = {}
+
+    for word in words:
+        # Skip non-words
+        if not textutils.isValidWord(word):
+            continue
+
+        l = len(word)
+
+        if l not in lengths:
+            lengths[l] = 1
+        else:
+            lengths[l] += 1
+
+    # normalize
+    norm_lengths = [(length, float(freq) / len(words)) for length, freq in lengths.items()]
+
+    return norm_lengths
+
+def processTexts(textStorage, resultsStorage):
+    count = textStorage.getTextCount()
+    current = 0
+    for item in textStorage.getAllTexts():
+        print("Processing item", current, "out of", count)
+        current = current + 1
+
+        itemid = item[0]
+        itemtext = item[1]
+
+        # perform analysis
+        wordLens = wordLengths(itemtext)
+
+        # store results
+        resultsStorage.store(itemid, wordLens)
+
+    print("Finished!")
+
+def main():
+    textStorage = TextStorage("data/texts.db")
+    resultsStorage = WordLengthStorage("data/results.db")
+    processTexts(textStorage, resultsStorage)
+
+main()
\ No newline at end of file
diff --git a/src/storage/__init__.py b/src/storage/__init__.py
index e69de29..e97a7b9 100644
--- a/src/storage/__init__.py
+++ b/src/storage/__init__.py
@@ -0,0 +1,33 @@
+import os.path
+import sqlite3
+
+class Storage:
+    def __init__(self, dbFile):
+        self.__dbFile = dbFile
+        self.__con = None
+        self.__cur = None
+        self.__initialize()
+
+    def __initialize(self):
+        self._createDatabase()
+
+    def _createDatabase(self):
+        pass
+
+    def _destroyDatabase(self):
+        pass
+
+    def connect(self):
+        self.__con = sqlite3.connect(self.__dbFile)
+        self.__cur = self.__con.cursor()
+        return self.__cur
+
+    def commit(self, doClose=True):
+        self.__con.commit()
+        if doClose:
+            self.__cur.close()
+            self.__con.close()
+
+    def recreateDatabase(self):
+        self._destroyDatabase()
+        self._createDatabase()
diff --git a/src/storage/data.py b/src/storage/data.py
deleted file mode 100644
index a3a4bcd..0000000
--- a/src/storage/data.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import logging
-import os
-from model import *
-import sqlite3
-
logging.getLogger("storage") - -DB_FRAGMENTS = "" - -# Commands - -# birth location - general area, not exact location (i.e. Transylvania) -# birth origin - rural or urban -# studies - masters, bachelors, high school, middle school, primary school -# occupation - comma separated if there are multiple -# studiesAbroad - foreign cities where author studied (comma separated) -COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors ( - name TEXT PRIMARY KEY, - birthYear INTEGER, - birthLocation TEXT, - birthOrigin TEXT, - studies TEXT, - occupations TEXT, - studiesAbroad TEXT - )""" - -# genre - short story (nuvela), novel (roman), poem etc -# movement - literary movement (submovements separated by /) (i.e. realism/naturalism) -# tags - other relevant information (i.e. psychological) -COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments ( - id INTEGER PRIMARY KEY, - title TEXT, - year INTEGER, - author TEXT REFERENCES Authors(name), - genre TEXT, - movement TEXT, - tags TEXT - )""" - -# contains the actual text -COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent ( - id INTEGER REFERENCES Fragments(id), - content TEXT - )""" - -# Initialize databases -def initializeFragmentDatabase(dbFile): - global DB_FRAGMENTS - DB_FRAGMENTS = dbFile - - if not os.path.exists(dbFile): - log.info("Text database %s not found. Will create database.", dbFile) - con = sqlite3.connect(dbFile) - c = con.cursor() - c.execute(COMMAND_CREATE_AUTHORS) - c.execute(COMMAND_CREATE_FRAGMENTS) - c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT) - con.commit() - con.close() - log.info("Database created!") - -def getTextCount(): - con = sqlite3.connect(DB_FRAGMENTS) - c = con.cursor() - c.execute("SELECT COUNT(*) FROM Fragments") - item = c.fetchone() - c.close() - con.close() - return item[0] - -def getAllTexts(): - con = sqlite3.connect(DB_FRAGMENTS) - c = con.cursor() - c.execute("SELECT id, content FROM FragmentsContent") - - items = c.fetchall() - - c.close() - con.close() - return items \ No newline at end of file diff --git a/src/storage/results.py b/src/storage/results.py deleted file mode 100644 index 6bba18b..0000000 --- a/src/storage/results.py +++ /dev/null @@ -1,84 +0,0 @@ -import logging -import os -from model.Word import * -import sqlite3 - -log = logging.getLogger("storage") - -DB_RESULTS = "" - -COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE LetterFrequencies ( - idtext INTEGER, - lettergroup TEXT, - category TEXT, - frequency REAL - )""" - -COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE TextWords ( - idtext INTEGER, - wordIndex INTEGER, - sentenceIndex INTEGER, - word TEXT, - lemma TEXT, - analysis TEXT, - chunk TEXT - )""" - -# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram ( -# idtext INTEGER, -# wordlength INTEGER, -# frequency REAL -# )""" - -def initializeResultsDatabase(dbFile, cleanupOldData): - global DB_RESULTS - DB_RESULTS = dbFile - - # cleanup old data - if cleanupOldData: - con = sqlite3.connect(DB_RESULTS) - c = con.cursor() - - try: - c.execute("DROP TABLE LetterFrequencies") - except sqlite3.OperationalError: - pass - c.execute(COMMAND_CREATE_LETTER_FREQUENCIES) - - try: - c.execute("DROP TABLE TextWords") - except sqlite3.OperationalError: - pass - c.execute(COMMAND_CREATE_TEXT_WORDS) - - con.commit() - c.close() - con.close() - - -def storeFrequencies(idtext, freq): - con = sqlite3.connect(DB_RESULTS) - c = con.cursor() - - # add data - chr = ['p', 'l1', 'l2', 'l3'] - for i in range(4): - for let, fr in freq[i]: - c.execute("INSERT INTO LetterFrequencies 
VALUES (?, ?, ?, ?)", (idtext, let, chr[i], fr)) - - con.commit() - c.close() - con.close() - -def storeTtlAnalysis(idtext, words): - con = sqlite3.connect(DB_RESULTS) - c = con.cursor() - - # store words - for word in words: - c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk)) - - # finish - con.commit() - c.close() - con.close() \ No newline at end of file diff --git a/src/storage/results/__init__.py b/src/storage/results/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/storage/results/letterFrequencies.py b/src/storage/results/letterFrequencies.py new file mode 100644 index 0000000..7c28346 --- /dev/null +++ b/src/storage/results/letterFrequencies.py @@ -0,0 +1,27 @@ +import storage + +class LetterFrequencyStorage(storage.Storage): + __COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE IF NOT EXISTS LetterFrequencies ( + idtext INTEGER, + lettergroup TEXT, + category TEXT, + frequency REAL + )""" + + def _createDatabase(self): + c = self.connect() + c.execute(self.__COMMAND_CREATE_LETTER_FREQUENCIES) + self.commit() + + def _destroyDatabase(self): + c = self.connect() + c.execute('DROP TABLE IF EXISTS LetterFrequencies') + self.commit() + + def store(self, idtext, frequencies): + c = self.connect() + chr = ['p', 'l1', 'l2', 'l3'] + for i in range(4): + for let, fr in frequencies[i]: + c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, chr[i], fr)) + self.commit() \ No newline at end of file diff --git a/src/storage/results/wordFrequencies.py b/src/storage/results/wordFrequencies.py new file mode 100644 index 0000000..c0c95a4 --- /dev/null +++ b/src/storage/results/wordFrequencies.py @@ -0,0 +1,24 @@ +import storage + +class WordFrequencyStorage(storage.Storage): + __COMMAND_CREATE_WORD_FREQUENCIES = """CREATE TABLE IF NOT EXISTS WordFrequencies ( + idtext INTEGER, + word TEXT, + frequency REAL + )""" + + def _createDatabase(self): + c = self.connect() + c.execute(self.__COMMAND_CREATE_WORD_FREQUENCIES) + self.commit() + + def _destroyDatabase(self): + c = self.connect() + c.execute('DROP TABLE IF EXISTS WordFrequencies') + self.commit() + + def store(self, idtext, frequencies): + c = self.connect() + for word, freq in frequencies: + c.execute('INSERT INTO WordFrequencies VALUES(?, ?, ?)', (idtext, word, freq)) + self.commit() \ No newline at end of file diff --git a/src/storage/results/wordLengths.py b/src/storage/results/wordLengths.py new file mode 100644 index 0000000..a78ec58 --- /dev/null +++ b/src/storage/results/wordLengths.py @@ -0,0 +1,24 @@ +import storage + +class WordLengthStorage(storage.Storage): + __COMMAND_CREATE_WORD_LENGTHS = """CREATE TABLE IF NOT EXISTS WordLengths ( + idtext INTEGER, + wordlength INTEGER, + frequency REAL + )""" + + def _createDatabase(self): + c = self.connect() + c.execute(self.__COMMAND_CREATE_WORD_LENGTHS) + self.commit() + + def _destroyDatabase(self): + c = self.connect() + c.execute('DROP TABLE IF EXISTS WordLengths') + self.commit() + + def store(self, idtext, frequencies): + c = self.connect() + for length, frequency in frequencies: + c.execute("INSERT INTO WordLengths VALUES(?, ?, ?)", (idtext, length, frequency)) + self.commit() \ No newline at end of file diff --git a/src/storage/texts.py b/src/storage/texts.py new file mode 100644 index 0000000..4337973 --- /dev/null +++ b/src/storage/texts.py @@ -0,0 +1,65 @@ +import storage + +class TextStorage(storage.Storage): + + # birth location 
+    # birth location - general area, not exact location (i.e. Transylvania)
+    # birth origin - rural or urban
+    # studies - masters, bachelors, high school, middle school, primary school
+    # occupation - comma separated if there are multiple
+    # studiesAbroad - foreign cities where author studied (comma separated)
+    __COMMAND_CREATE_AUTHORS = """CREATE TABLE IF NOT EXISTS Authors (
+        name TEXT PRIMARY KEY,
+        birthYear INTEGER,
+        birthLocation TEXT,
+        birthOrigin TEXT,
+        studies TEXT,
+        occupations TEXT,
+        studiesAbroad TEXT
+    )"""
+
+    # genre - short story (nuvela), novel (roman), poem etc
+    # movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
+    # tags - other relevant information (i.e. psychological)
+    __COMMAND_CREATE_FRAGMENTS = """CREATE TABLE IF NOT EXISTS Fragments (
+        id INTEGER PRIMARY KEY,
+        title TEXT,
+        year INTEGER,
+        author TEXT REFERENCES Authors(name),
+        genre TEXT,
+        movement TEXT,
+        tags TEXT
+    )"""
+
+    # contains the actual text
+    __COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE IF NOT EXISTS FragmentsContent (
+        id INTEGER REFERENCES Fragments(id),
+        content TEXT
+    )"""
+
+    def _createDatabase(self):
+        c = self.connect()
+        c.execute(self.__COMMAND_CREATE_AUTHORS)
+        c.execute(self.__COMMAND_CREATE_FRAGMENTS)
+        c.execute(self.__COMMAND_CREATE_FRAGMENTS_CONTENT)
+        self.commit()
+
+    def _destroyDatabase(self):
+        c = self.connect()
+        c.execute('DROP TABLE IF EXISTS Authors')
+        c.execute('DROP TABLE IF EXISTS Fragments')
+        c.execute('DROP TABLE IF EXISTS FragmentsContent')
+        self.commit()
+
+    def getTextCount(self):
+        c = self.connect()
+        c.execute("SELECT COUNT(*) FROM Fragments")
+        item = c.fetchone()
+        self.commit()
+        return item[0]
+
+    def getAllTexts(self):
+        c = self.connect()
+        c.execute("SELECT id, content FROM FragmentsContent")
+        items = c.fetchall()
+        self.commit()
+        return items
\ No newline at end of file
diff --git a/src/textprocessor/wordanalysis.py b/src/textprocessor/wordanalysis.py
deleted file mode 100644
index ac2af3a..0000000
--- a/src/textprocessor/wordanalysis.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def analyzeWords(text):
-    pass
\ No newline at end of file
diff --git a/src/textutils/__init__.py b/src/textutils/__init__.py
new file mode 100644
index 0000000..d29fdcf
--- /dev/null
+++ b/src/textutils/__init__.py
@@ -0,0 +1,16 @@
+def fixDiacritics(text):
+    text = text.replace(u'ĭ', 'i')
+    text = text.replace(u'ŭ', 'u')
+    text = text.replace(u'à', 'a')
+    return text
+
+def isValidWord(word):
+    # Alphanumeric => word
+    if word.isalnum():
+        return True
+
+    # Some words might be contractions, which finish/begin with a '
+    if word[1:].isalnum() or word[:-1].isalnum():
+        return True
+
+    return False
diff --git a/src/tools/test.php b/src/tools/test.php
new file mode 100644
index 0000000..18fbef1
--- /dev/null
+++ b/src/tools/test.php
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file