Initial commit

commit 6badfbd103
2017-05-23 13:57:53 +03:00
38 changed files with 1286 additions and 0 deletions

src/logger.py Normal file (+20)

@@ -0,0 +1,20 @@
import logging
import os
import time

def init_logger(level):
    # Make sure the log directory exists, then build the log filename
    os.makedirs("logs", exist_ok=True)
    tm = time.strftime('%Y-%m-%d_%H-%M-%S')
    logFile = "logs/log_{0}.log".format(tm)
# Set up file logger
logging.basicConfig(filename=logFile,
level=logging.DEBUG,
format='%(asctime)s %(name)s %(levelname)s %(message)s',
datefmt='%m-%d %H:%M')
# Set up console logger
formatter = logging.Formatter('[%(name)s] %(levelname)s: %(message)s')
console = logging.StreamHandler()
console.setLevel(level)
console.setFormatter(formatter)
logging.getLogger().addHandler(console)
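
# Usage (as in main.py): init_logger(logging.WARNING) sends every DEBUG+ record to
# the log file, while the console handler only shows WARNING and above.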

src/main.py Normal file (+40)

@@ -0,0 +1,40 @@
import logging
import time
# own
import logger
import storage.data
import storage.results
import textprocessor.letterfreq
import ttl.ttlparser
import ttl.ttlservice
def init():
logger.init_logger(logging.WARNING)
storage.data.initializeFragmentDatabase("data/texts.db")
storage.results.initializeResultsDatabase("data/results.db", True)
def processTexts():
count = storage.data.getTextCount()
    current = 1
    for item in storage.data.getAllTexts():
        print("Processing item", current, "out of", count)
        current = current + 1
itemid = item[0]
itemtext = item[1]
# obtain ttl analysis
        # infeasible - it takes 5-10 minutes for a single text
# ttlResult = ttl.ttlservice.executeTtl(itemtext)
# (words, chunks) = ttl.ttlparser.parseText(ttlResult)
# storage.results.storeTtlAnalysis(itemid, words)
# perform analysis
letterFreq = textprocessor.letterfreq.letterFrequencies(itemtext)
storage.results.storeFrequencies(itemid, letterFreq)
print("Finished!")
if __name__ == "__main__":
    init()
    processTexts()

src/model.py Normal file (+32)

@@ -0,0 +1,32 @@
# Defines a fragment author
class Author:
def __init__(self, name = "", birthYear = "", location = "Romania"):
self.name = name
self.yearOfBirth = birthYear
self.location = location
def __str__(self):
return self.name
def __repr__(self):
return self.name
def dump(self):
return "[Author name={0} yearOfBirth={1} location={2}]".format(self.name, self.yearOfBirth, self.location)
# Defines a text fragment
class Fragment:
    def __init__(self, title = "", text = "", author = None, year = 1999):
        self.title = title
        self.text = text
        # avoid a shared mutable default argument: make a fresh Author when none is given
        self.author = author if author is not None else Author()
        self.year = year
def __str__(self):
return self.title
def __repr__(self):
return self.title
def dump(self):
return "[Fragment title={0} author={1} year={2} text={3}]".format(self.title, self.author.dump(), self.year, self.text)

src/model/Word.py Normal file (+87)

@@ -0,0 +1,87 @@
# Defines a processed word
class Word:
text = ""
lemma = ""
ana = ""
chunk = ""
sentenceIndex = 0
wordIndex = 0
def __init__(self, text, lemma, ana, chunk, sentenceIndex, wordIndex):
self.text = text
self.lemma = lemma
self.ana = ana
self.chunk = chunk
self.sentenceIndex = sentenceIndex
self.wordIndex = wordIndex
def __str__(self):
return "{0} (lemma {1}, ana {2}, chunk {3})".format(self.text, self.lemma, self.ana, self.chunk)
def __repr__(self):
return str(self)
def isNoun(self):
return self.ana[0] == "N"
def nounIsCommon(self):
return self.isNoun() and self.ana[1] == "c"
def nounIsProper(self):
return self.isNoun() and self.ana[1] == "p"
def nounGetCase(self):
if self.isNoun():
return self.ana[4]
return None
    # Is the noun in definite (articulated) form?
    def nounIsDefinite(self):
        if self.isNoun():
            # proper nouns are always definite
            if self.nounIsProper():
                return True
            # position 5 of the tag holds definiteness ('y'/'n') when present
            if len(self.ana) > 5:
                return self.ana[5] == "y"
            return False
        return None
def pronounGetPerson(self):
if self.isPronoun():
return self.ana[2]
def getGender(self):
if self.isNoun():
if (len(self.ana) >= 3):
return self.ana[2]
return 'n'
if self.isPronoun():
return self.ana[3]
return None
def getNumber(self):
if self.isNoun():
if self.nounIsProper():
return 's'
else:
return self.ana[3]
if self.isPronoun():
return self.ana[4]
return None
def isPronoun(self):
return self.ana[0] == "P"
def isVerb(self):
return self.ana[0] == "V"
def isPreposition(self):
return self.ana[0] == "S" and self.ana[1] == "p"

src/model/__init__.py Normal file (+0)

src/storage/__init__.py Normal file (+0)

src/storage/data.py Normal file (+80)

@@ -0,0 +1,80 @@
import logging
import os
from model import *
import sqlite3
log = logging.getLogger("storage")
DB_FRAGMENTS = ""
# Commands
# birth location - general area, not exact location (i.e. Transylvania)
# birth origin - rural or urban
# studies - masters, bachelors, high school, middle school, primary school
# occupation - comma separated if there are multiple
# studiesAbroad - foreign cities where author studied (comma separated)
COMMAND_CREATE_AUTHORS = """CREATE TABLE Authors (
name TEXT PRIMARY KEY,
birthYear INTEGER,
birthLocation TEXT,
birthOrigin TEXT,
studies TEXT,
occupations TEXT,
studiesAbroad TEXT
)"""
# genre - short story (nuvela), novel (roman), poem etc
# movement - literary movement (submovements separated by /) (i.e. realism/naturalism)
# tags - other relevant information (i.e. psychological)
COMMAND_CREATE_FRAGMENTS = """CREATE TABLE Fragments (
id INTEGER PRIMARY KEY,
title TEXT,
year INTEGER,
author TEXT REFERENCES Authors(name),
genre TEXT,
movement TEXT,
tags TEXT
)"""
# contains the actual text
COMMAND_CREATE_FRAGMENTS_CONTENT = """CREATE TABLE FragmentsContent (
id INTEGER REFERENCES Fragments(id),
content TEXT
)"""
# Initialize databases
def initializeFragmentDatabase(dbFile):
global DB_FRAGMENTS
DB_FRAGMENTS = dbFile
if not os.path.exists(dbFile):
log.info("Text database %s not found. Will create database.", dbFile)
con = sqlite3.connect(dbFile)
c = con.cursor()
c.execute(COMMAND_CREATE_AUTHORS)
c.execute(COMMAND_CREATE_FRAGMENTS)
c.execute(COMMAND_CREATE_FRAGMENTS_CONTENT)
con.commit()
con.close()
log.info("Database created!")
def getTextCount():
con = sqlite3.connect(DB_FRAGMENTS)
c = con.cursor()
c.execute("SELECT COUNT(*) FROM Fragments")
item = c.fetchone()
c.close()
con.close()
return item[0]
def getAllTexts():
con = sqlite3.connect(DB_FRAGMENTS)
c = con.cursor()
c.execute("SELECT id, content FROM FragmentsContent")
items = c.fetchall()
c.close()
con.close()
return items
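
# Usage sketch (mirrors main.py): call initializeFragmentDatabase("data/texts.db")
# once, then iterate getAllTexts() for (id, content) rows, using getTextCount()
# to report progress.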

src/storage/results.py Normal file (+84)

@@ -0,0 +1,84 @@
import logging
import os
from model.Word import *
import sqlite3
log = logging.getLogger("storage")
DB_RESULTS = ""
COMMAND_CREATE_LETTER_FREQUENCIES = """CREATE TABLE IF NOT EXISTS LetterFrequencies (
idtext INTEGER,
lettergroup TEXT,
category TEXT,
frequency REAL
)"""
COMMAND_CREATE_TEXT_WORDS = """CREATE TABLE IF NOT EXISTS TextWords (
idtext INTEGER,
wordIndex INTEGER,
sentenceIndex INTEGER,
word TEXT,
lemma TEXT,
analysis TEXT,
chunk TEXT
)"""
# COMMAND_CREATE_WORDLENGTH_HISTOGRAM = """CREATE TABLE WordLengthHistogram (
# idtext INTEGER,
# wordlength INTEGER,
# frequency REAL
# )"""
def initializeResultsDatabase(dbFile, cleanupOldData):
    global DB_RESULTS
    DB_RESULTS = dbFile
    con = sqlite3.connect(DB_RESULTS)
    c = con.cursor()
    # drop old data when requested
    if cleanupOldData:
        c.execute("DROP TABLE IF EXISTS LetterFrequencies")
        c.execute("DROP TABLE IF EXISTS TextWords")
    # create any missing tables, even when no cleanup was requested
    c.execute(COMMAND_CREATE_LETTER_FREQUENCIES)
    c.execute(COMMAND_CREATE_TEXT_WORDS)
    con.commit()
    c.close()
    con.close()
def storeFrequencies(idtext, freq):
con = sqlite3.connect(DB_RESULTS)
c = con.cursor()
    # add data; category codes match letterfreq output: punctuation, then 1-/2-/3-letter groups
    categories = ['p', 'l1', 'l2', 'l3']
    for i in range(4):
        for let, fr in freq[i]:
            c.execute("INSERT INTO LetterFrequencies VALUES (?, ?, ?, ?)", (idtext, let, categories[i], fr))
con.commit()
c.close()
con.close()
def storeTtlAnalysis(idtext, words):
con = sqlite3.connect(DB_RESULTS)
c = con.cursor()
# store words
for word in words:
c.execute("INSERT INTO TextWords VALUES (?, ?, ?, ?, ?, ?, ?)", (idtext, word.wordIndex, word.sentenceIndex, word.text, word.lemma, word.ana, word.chunk))
# finish
con.commit()
c.close()
con.close()
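
# Both store helpers expect data shaped the way the analysis code produces it:
# storeFrequencies takes the four (symbol, relative frequency) lists returned by
# textprocessor.letterfreq.letterFrequencies, and storeTtlAnalysis takes the Word
# list returned by ttl.ttlparser.parseText.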

src/test.py Normal file (+14)

@@ -0,0 +1,14 @@
# coding: utf-8
from ttl import ttlservice
from ttl import ttlparser
import nltk
import storage
# storage.parseIndex is not available in the storage package, so these calls
# stay commented out with the rest of this scratch script
#data = storage.parseIndex("data")
#print(data)
#textXml = ttlservice.executeTtl(u"Numele meu este Tibi și îmi place să cânt la chitară bass. Ce faci?")
#words, chunks = ttlparser.parseText(textXml)
#print ("Words: ", words)
#print ("Chunks: ", chunks)

src/textprocessor/__init__.py Normal file (+0)

src/textprocessor/letterfreq.py Normal file (+38)

@@ -0,0 +1,38 @@
import operator
import storage
def letterFrequencies(text):
letterfreq = [{}, {}, {}, {}]
lettersum = [0, 0, 0, 0]
n = len(text)
for i in range(n):
# compute substring frequency
# l = substring length
for l in range(1, 4):
sub = text[i : i + l].lower()
if len(sub) == l and sub.isalnum():
lettersum[l] += 1
                if sub not in letterfreq[l]:
letterfreq[l][sub] = 1
else:
letterfreq[l][sub] += 1
        # compute punctuation frequency (ch, not chr, to avoid shadowing the builtin)
        ch = text[i]
        if not ch.isalnum() and not ch.isspace() and ch.isprintable():
            lettersum[0] += 1
            if ch not in letterfreq[0]:
                letterfreq[0][ch] = 1
            else:
                letterfreq[0][ch] += 1
    # Almost done. Sort, keep only the most frequent items, and normalize the counts
    for i in range(4):
        freqSorted = sorted(letterfreq[i].items(), key=operator.itemgetter(1), reverse=True)
        freqFiltered = freqSorted[0:50]
        total = lettersum[i] if lettersum[i] > 0 else 1  # guard against empty/symbol-free texts
        freqNormalized = [(symbol, freq / total) for symbol, freq in freqFiltered]
        letterfreq[i] = freqNormalized
return letterfreq
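
# Sketch: letterFrequencies("Ana are mere.") returns four (symbol, frequency) lists:
# [0] punctuation ('.'), [1] single letters, [2] two-letter groups, [3] three-letter
# groups, each normalized by its own total and truncated to the 50 most frequent items.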

(file name not shown) Normal file (+2)

@@ -0,0 +1,2 @@
def analyzeWords(text):
pass

(file name not shown) Normal file (+128)

@@ -0,0 +1,128 @@
import urllib.request
import urllib.error
from pyquery import PyQuery
import sqlite3
import re
BASE_URL = "https://ro.wikisource.org"
LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
def getAuthorList():
authors = []
for letter in LETTERS:
print("Processing link page for letter", letter)
# Read index page
url = BASE_URL + '/wiki/Categorie:Autori-' + letter
data = urllib.request.urlopen(url).read()
q = PyQuery(data)
for item in q("div.mw-category-generated").find("a"):
if (item.text.startswith("Autor:")):
authorname = item.text[6:]
authorlink = item.attrib['href']
authors.append((authorname, authorlink))
return list(set(authors))
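
# getAuthorList() yields deduplicated (name, relative link) pairs scraped from the
# per-letter category pages, e.g. ("Mihai Eminescu", "/wiki/Autor:Mihai_Eminescu").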
def getAuthorWikiLink(query):
wikilink = None
body = query("div#mw-content-text")
table = body.find("table")
for link in table.find("a"):
if "ro.wikipedia.org" in link.attrib['href']:
wikilink = link.attrib['href']
return wikilink
def getAuthorLinksList(authorname, query):
links = []
body = query("div#mw-content-text")
for link in body.find("a"):
        address = link.attrib['href']
        # skip external links, redlinks, file pages, wiki internals, anchors and author pages
        skipTokens = ["http", "redlink", "Fi%C8%99ier:", "index.php", "Autor:"]
        ok = not address.startswith("#")
        ok = ok and not any(token in address for token in skipTokens)
        if ok:
            links.append(address)
return links
def getAuthorBasicInfo(authorname, authorlink):
info = {}
data = urllib.request.urlopen(BASE_URL + authorlink).read()
q = PyQuery(data)
info["wiki"] = getAuthorWikiLink(q)
info["links"] = getAuthorLinksList(authorname, q)
return info
# def getAuthorWikiInfo(authorinfo):
# # Nothing can be learned without wiki page
# if authorinfo["wiki"] is None:
# return authorinfo
# try:
# data = urllib.request.urlopen(authorinfo["wiki"]).read()
# q = PyQuery(data)
# # Find the birth date
# body = q("#mw-content-text").text()
# result = re.compile(u"Născut\s+([\w\s]+)").match(body)
# if not result is None:
# authorinfo["birthyear"] = result.group(0)
# except urllib.error.HTTPError:
# pass
# return authorinfo
def getText(url):
data = urllib.request.urlopen(BASE_URL + url).read()
q = PyQuery(data)
texttitle = q("h1").text()
body = q("#mw-content-text")
body.find("table").remove()
textcontent = body.text()
return (texttitle, textcontent)
def addAuthorToDb(authorinfo):
    # unfinished helper: the author INSERT currently lives inline in getAllTexts()
    pass
def getAllTexts():
    con = sqlite3.connect("data/texts.db")
    c = con.cursor()
    #c.execute("ALTER TABLE Authors ADD COLUMN wiki TEXT")
    textid = 1  # running primary key for Fragments rows; avoids shadowing the id() builtin
    authors = getAuthorList()
    for authorname, authorlink in authors:
        print("Processing author", authorname)
        authorinfo = getAuthorBasicInfo(authorname, authorlink)
        c.execute("INSERT INTO Authors(name,wiki) VALUES(?, ?)", (authorname, authorinfo["wiki"]))
        # authorinfo = getAuthorWikiInfo(authorinfo)
        for text in authorinfo["links"]:
            try:
                title, content = getText(text)
                c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)", (textid, title, authorname))
                c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)", (textid, content))
                textid = textid + 1
            except urllib.error.HTTPError:
                # skip pages that fail to download
                continue
    con.commit()
    con.close()

getAllTexts()

src/ttl/__init__.py Normal file (+0)

src/ttl/ttlparser.py Normal file (+62)

@@ -0,0 +1,62 @@
'''
Created on May 22, 2016
@author: tibi
'''
from xml.dom import minidom
from xml.parsers.expat import ExpatError
from model.Word import Word
def parseText(xmlText):
words = []
chunks = {}
sentence_i = 0
# get the root "segs" element
    try:
        dom = minidom.parseString(xmlText)
    except ExpatError as e:
        # report the offending input, then re-raise instead of killing the interpreter
        print("Error in text:", xmlText)
        print(e)
        raise
alltext = dom.getElementsByTagName("segs")
# iterate paragraphs
for paragraph in alltext[0].getElementsByTagName("seg"):
# iterate sentences
for sentence in paragraph.getElementsByTagName("s"):
# increment sentence index
sentence_i += 1
word_i = 0
# iterate words
for word in sentence.getElementsByTagName("w"):
# increment word index
word_i += 1
# obtain word info
wordText = word.firstChild.data
lemma = word.getAttribute("lemma")
ana = word.getAttribute("ana")
chunk = word.getAttribute("chunk")
# create word
#w = Word(wordText, lemma, ana, chunk, sentence_i, word_i)
#words.append(w)
for c in chunk.split(","):
w = Word(wordText, lemma, ana, c, sentence_i, word_i)
words.append(w)
                    if chunks.get((sentence_i, c)) is None:
chunks[(sentence_i, c)] = [ w ]
else:
chunks[(sentence_i, c)].append(w)
return (words, chunks)
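
# Usage sketch (matching the commented-out calls in main.py):
#   (words, chunks) = parseText(ttl.ttlservice.executeTtl(text))
# where `chunks` maps (sentenceIndex, chunkLabel) to the Word objects in that chunk.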

src/ttl/ttlservice.py Normal file (+34)

@@ -0,0 +1,34 @@
# coding: utf-8
import zeep
def executeTtl(text):
# Preprocess the text
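    # (ĭ and ŭ are archaic Romanian letters found in older texts; presumably the
    # web service does not handle them, so they are folded to plain i/u/a first)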
text = text.replace(u'ĭ', 'i')
text = text.replace(u'ŭ', 'u')
text = text.replace(u'à', 'a')
client = zeep.Client("http://ws.racai.ro/ttlws.wsdl")
textSgml = client.service.UTF8toSGML(text)
result = client.service.XCES("ro", "id", textSgml)
# Cleanup result - generate valid xml
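    # note: legacy cedilla letters (ş, ţ) are normalized below to the standard
    # comma-below forms (ș, ț) used in modern Romanian orthography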
result = result.replace('’', '`')
result = result.replace('ă', u'ă')
result = result.replace('à', u'à')
result = result.replace('â', u'â')
result = result.replace('î', u'î')
result = result.replace('ş', u'ș')
result = result.replace('ţ', u'ț')
result = result.replace('ŭ', u'u')
result = result.replace('Ă', u'Ă')
result = result.replace('À', u'À')
result = result.replace('Â', u'Â')
result = result.replace('Î', u'Î')
result = result.replace('Ş', u'Ș')
result = result.replace('Ţ', u'Ț')
result = result.replace('Ŭ', u'U')
xmlResult = "<?xml version=\"1.0\" encoding=\"utf-8\" ?><segs>"
xmlResult += result
xmlResult += "</segs>"
return xmlResult