Refactored code; organized letter and word metrics
This commit is contained in:
109
src/step0_acquire/wikisource_downloader.py
Normal file
109
src/step0_acquire/wikisource_downloader.py
Normal file
@ -0,0 +1,109 @@
|
||||
import re
import sqlite3
import urllib
import urllib.error
import urllib.request

from pyquery import PyQuery
|
||||
|
||||
# Scheme and host of the Wikisource site; page paths and hrefs are appended to it.
BASE_URL = "https://ro.wikisource.org"
|
||||
# Letters used to build the per-letter author category page URLs.
LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
|
||||
def getAuthorList():
    """Collect the authors listed on the per-letter author category pages.

    For every letter in ``LETTERS`` the page
    ``BASE_URL + '/wiki/Categorie:Autori-' + letter`` is downloaded, and each
    anchor inside ``div.mw-category-generated`` whose text starts with
    ``"Autor:"`` is recorded.

    Returns:
        A list of unique ``(authorname, authorlink)`` tuples, where
        ``authorname`` is the anchor text without the ``"Autor:"`` prefix and
        ``authorlink`` is the anchor's ``href``. Duplicates are removed through
        a set, so the order of the result is not defined.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when a category
            page cannot be fetched.
    """
    prefix = "Autor:"
    authors = set()
    for letter in LETTERS:
        print("Processing link page for letter", letter)

        # Read index page; the context manager closes the HTTP response.
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        with urllib.request.urlopen(url) as response:
            data = response.read()
        q = PyQuery(data)

        for item in q("div.mw-category-generated").find("a"):
            text = item.text
            href = item.attrib.get('href')
            # An anchor may carry no text (e.g. it only wraps an image) or no
            # href; skip those instead of failing on None / KeyError.
            if text is None or href is None:
                continue
            if text.startswith(prefix):
                authors.add((text[len(prefix):], href))
    return list(authors)
|
||||
|
||||
def getAuthorWikiLink(query):
    """Find the Wikipedia link shown on an author page.

    Args:
        query: Callable document object for the author page (a ``PyQuery``
            instance in this file); it is called with a CSS selector.

    Returns:
        The ``href`` of the last anchor, among the anchors located inside
        tables under ``div#mw-content-text``, whose ``href`` contains
        ``"ro.wikipedia.org"``; ``None`` when there is no such anchor.
    """
    wikilink = None
    tables = query("div#mw-content-text").find("table")
    for link in tables.find("a"):
        # Anchors without an href are ignored rather than raising KeyError.
        href = link.attrib.get('href', '')
        if "ro.wikipedia.org" in href:
            # No break: a later match overrides an earlier one.
            wikilink = href
    return wikilink
|
||||
|
||||
def getAuthorLinksList(authorname, query):
    """List the hrefs on an author page that are kept as candidate text links.

    Args:
        authorname: Name of the author; not used by the filtering itself and
            kept for interface compatibility.
        query: Callable document object for the author page (a ``PyQuery``
            instance in this file); it is called with a CSS selector.

    Returns:
        The ``href`` values, in document order, of the anchors under
        ``div#mw-content-text`` that pass all filters. An href is dropped when
        it is missing or empty, starts with ``"#"``, or contains any of
        ``"http"``, ``"redlink"``, ``"Fi%C8%99ier:"``, ``"index.php"`` or
        ``"Autor:"``.
    """
    # "Fi%C8%99ier:" is the percent-encoded form of "Fișier:".
    excluded_fragments = ("http", "redlink", "Fi%C8%99ier:", "index.php", "Autor:")

    links = []
    body = query("div#mw-content-text")
    for link in body.find("a"):
        address = link.attrib.get('href')
        # Anchors without a usable href cannot be followed; skip them instead
        # of raising KeyError.
        if not address:
            continue
        # Same-page fragment links.
        if address.startswith("#"):
            continue
        if any(fragment in address for fragment in excluded_fragments):
            continue
        links.append(address)
    return links
|
||||
|
||||
def getAuthorBasicInfo(authorname, authorlink):
    """Download an author page and extract its Wikipedia link and text links.

    Args:
        authorname: Name of the author, forwarded to ``getAuthorLinksList``.
        authorlink: Path of the author page; it is appended to ``BASE_URL``.

    Returns:
        A dict with two keys: ``"wiki"`` (result of ``getAuthorWikiLink``) and
        ``"links"`` (result of ``getAuthorLinksList``).

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when the page
            cannot be fetched.
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(BASE_URL + authorlink) as response:
        data = response.read()
    q = PyQuery(data)

    return {
        "wiki": getAuthorWikiLink(q),
        "links": getAuthorLinksList(authorname, q),
    }
|
||||
|
||||
def getText(url):
    """Download a page and return its title and text content.

    Args:
        url: Path of the page; it is appended to ``BASE_URL``.

    Returns:
        A ``(texttitle, textcontent)`` tuple: the text of the page's ``h1``
        element(s) and the text of ``#mw-content-text`` after every table
        inside it has been removed.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when the page
            cannot be fetched (``HTTPError`` is a subclass).
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(BASE_URL + url) as response:
        data = response.read()
    q = PyQuery(data)

    texttitle = q("h1").text()

    body = q("#mw-content-text")
    # Drop tables so their text does not end up in the extracted content.
    body.find("table").remove()

    textcontent = body.text()
    return (texttitle, textcontent)
|
||||
|
||||
def addAuthorToDb(authorinfo, dbpath="data/texts.db"):
    """Insert one author row into the ``Authors`` table.

    The previous statement (``INSERT INTO Authors`` with no column list or
    values) was not valid SQL and the change was never committed. The row is
    now written with the same ``name`` / ``wiki`` columns that
    ``getAllTexts`` uses.

    Args:
        authorinfo: Mapping describing the author. ``authorinfo["name"]`` is
            stored in ``name`` and ``authorinfo.get("wiki")`` in ``wiki``.
            NOTE(review): ``getAuthorBasicInfo`` only fills ``"wiki"`` and
            ``"links"``, so the caller is assumed to add ``"name"`` - confirm.
        dbpath: Path of the SQLite database file; defaults to the path this
            module uses elsewhere.

    Raises:
        KeyError: if ``authorinfo`` has no ``"name"`` entry.
        sqlite3.Error: if the insert fails (for example a missing table).
    """
    row = (authorinfo["name"], authorinfo.get("wiki"))
    con = sqlite3.connect(dbpath)
    try:
        # ``with con`` commits on success and rolls back on error; it does
        # not close the connection, hence the surrounding try/finally.
        with con:
            con.execute("INSERT INTO Authors(name,wiki) VALUES(?, ?)", row)
    finally:
        con.close()
|
||||
|
||||
def getAllTexts():
    """Download every author and their linked texts into ``data/texts.db``.

    For each author returned by ``getAuthorList`` a row is inserted into
    ``Authors(name, wiki)``. Every link from the author's page is then
    fetched with ``getText`` and stored as a ``Fragments(id, title, author)``
    row plus a ``FragmentsContent(id, content)`` row sharing the same id.
    Ids are assigned sequentially starting at 1. Links whose download raises
    ``urllib.error.HTTPError`` are skipped.

    The transaction is committed after each author, so an uncaught error
    later in the run keeps the authors already processed. The connection is
    always closed.

    Raises:
        urllib.error.URLError: propagated when a category or author page
            cannot be fetched.
        sqlite3.Error: propagated when an insert fails.
    """
    con = sqlite3.connect("data/texts.db")
    try:
        c = con.cursor()
        fragment_id = 1

        for authorname, authorlink in getAuthorList():
            print("Processing author", authorname)
            authorinfo = getAuthorBasicInfo(authorname, authorlink)
            c.execute("INSERT INTO Authors(name,wiki) VALUES(?, ?)", (authorname, authorinfo["wiki"]))

            for text in authorinfo["links"]:
                # Only the download can raise HTTPError, so the try block is
                # limited to it.
                try:
                    title, content = getText(text)
                except urllib.error.HTTPError:
                    continue
                c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)", (fragment_id, title, authorname))
                c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)", (fragment_id, content))
                fragment_id += 1

            # Persist this author's rows before moving on to the next one.
            con.commit()
    finally:
        con.close()
|
||||
|
||||
# Module-level call: the whole download runs whenever this file is executed or imported.
getAllTexts()
|
Reference in New Issue
Block a user