Files
roauthorship/src/step0_acquire/wikisource_downloader.py

109 lines
3.2 KiB
Python

import re
import sqlite3
import urllib
import urllib.error
import urllib.request

from pyquery import PyQuery
# Root of the Romanian Wikisource site; page paths and hrefs are appended to it
# when building the URLs to download.
BASE_URL = "https://ro.wikisource.org"
# Letters used to build the per-letter author category URLs
# ('/wiki/Categorie:Autori-' + letter).
LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
def getAuthorList():
    """Collect the authors listed on the per-letter author category pages.

    For every letter in ``LETTERS`` the page
    ``BASE_URL + '/wiki/Categorie:Autori-' + letter`` is downloaded, and each
    anchor inside ``div.mw-category-generated`` whose text starts with
    ``"Autor:"`` is recorded.

    Returns:
        A list of unique ``(authorname, authorlink)`` tuples in order of
        first appearance. ``authorname`` is the anchor text without the
        ``"Autor:"`` prefix and ``authorlink`` is the anchor's ``href``.

    Raises:
        urllib.error.URLError: if a category page cannot be downloaded.
    """
    prefix = "Autor:"
    authors = []
    for letter in LETTERS:
        print("Processing link page for letter", letter)
        # Read index page
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        # Release the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(url) as response:
            data = response.read()
        q = PyQuery(data)
        for item in q("div.mw-category-generated").find("a"):
            # ``text`` can be None when the anchor carries no leading text,
            # and an anchor is not guaranteed to have an href.
            text = item.text or ""
            href = item.attrib.get('href')
            if href is not None and text.startswith(prefix):
                authors.append((text[len(prefix):], href))
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same from one run to the next.
    return list(dict.fromkeys(authors))
def getAuthorWikiLink(query):
    """Return the Wikipedia link found in the tables of an author page.

    Args:
        query: Callable selector for the parsed page (a ``PyQuery`` document
            in this file). ``query("div#mw-content-text")`` must return an
            object supporting ``find``.

    Returns:
        The ``href`` of the last anchor inside a ``table`` of the content
        area whose address contains ``"ro.wikipedia.org"``, or ``None`` when
        there is no such anchor.
    """
    wikilink = None
    tables = query("div#mw-content-text").find("table")
    for link in tables.find("a"):
        # Anchors without an href cannot point to Wikipedia; skip them
        # instead of failing with KeyError.
        href = link.attrib.get('href')
        if href and "ro.wikipedia.org" in href:
            # No early exit: the last matching anchor wins.
            wikilink = href
    return wikilink
def getAuthorLinksList(authorname, query):
    """List the links to texts found on an author page.

    Args:
        authorname: Name of the author. Not used by the filter; kept so the
            signature stays compatible with existing callers.
        query: Callable selector for the parsed author page (a ``PyQuery``
            document in this file).

    Returns:
        The ``href`` values, in document order, of the anchors inside
        ``div#mw-content-text`` that pass the filter. An anchor is dropped
        when it has no (or an empty) ``href``, when the address starts with
        ``"#"``, or when the address contains any of ``"http"``,
        ``"redlink"``, ``"Fi%C8%99ier:"``, ``"index.php"`` or ``"Autor:"``.
    """
    # Substrings marking an address that should not be followed as a text.
    excluded_markers = ("http", "redlink", "Fi%C8%99ier:", "index.php", "Autor:")
    links = []
    for link in query("div#mw-content-text").find("a"):
        # Anchors are not guaranteed to carry an href.
        address = link.attrib.get('href')
        if not address:
            continue
        if address.startswith("#"):
            continue
        if any(marker in address for marker in excluded_markers):
            continue
        links.append(address)
    return links
def getAuthorBasicInfo(authorname, authorlink):
    """Download an author page and extract its Wikipedia link and text links.

    Args:
        authorname: Name of the author; forwarded to ``getAuthorLinksList``.
        authorlink: Address of the author page, appended to ``BASE_URL``.

    Returns:
        A dict with two keys: ``"wiki"`` holds the result of
        ``getAuthorWikiLink`` and ``"links"`` holds the result of
        ``getAuthorLinksList`` for the downloaded page.

    Raises:
        urllib.error.URLError: if the author page cannot be downloaded.
    """
    # Release the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(BASE_URL + authorlink) as response:
        data = response.read()
    q = PyQuery(data)
    return {
        "wiki": getAuthorWikiLink(q),
        "links": getAuthorLinksList(authorname, q),
    }
def getText(url):
    """Download a text page and return its title and plain-text content.

    Args:
        url: Address of the page, appended to ``BASE_URL``.

    Returns:
        A ``(texttitle, textcontent)`` tuple: the text of the page's ``h1``
        element(s) and the text of ``#mw-content-text`` with every ``table``
        removed.

    Raises:
        urllib.error.HTTPError: if the server answers with an HTTP error
            status.
        urllib.error.URLError: if the page cannot be reached at all.
    """
    # Release the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(BASE_URL + url) as response:
        data = response.read()
    q = PyQuery(data)
    texttitle = q("h1").text()
    body = q("#mw-content-text")
    # Tables are stripped before the text is extracted so their content does
    # not end up in the returned text.
    body.find("table").remove()
    return (texttitle, body.text())
# Presumably meant to store one author in the Authors table of data/texts.db.
# NOTE(review): this looks unfinished and should be confirmed or removed:
#   - ``authorinfo`` is never read;
#   - the statement below has no VALUES / SELECT / DEFAULT VALUES clause, so
#     SQLite is expected to reject it with sqlite3.OperationalError;
#   - the connection is neither committed nor closed.
# No call to this function is visible in this file.
def addAuthorToDb(authorinfo):
con = sqlite3.connect("data/texts.db")
c = con.cursor()
c.execute("INSERT INTO Authors")
def getAllTexts():
    """Download every listed author and their texts into ``data/texts.db``.

    For each author returned by ``getAuthorList`` one row is inserted into
    ``Authors(name, wiki)``. For each text link of that author one row is
    inserted into ``Fragments(id, title, author)`` and one into
    ``FragmentsContent(id, content)``. Fragment ids are assigned sequentially
    starting at 1. A text whose download fails with an HTTP error is skipped.

    The tables must already exist, and ``Authors`` needs a ``wiki`` column
    (``ALTER TABLE Authors ADD COLUMN wiki TEXT`` on an older database).

    Raises:
        urllib.error.URLError: if a category page or an author page cannot
            be downloaded (only failures of individual texts are skipped).
        sqlite3.Error: if the database rejects an insert.
    """
    con = sqlite3.connect("data/texts.db")
    try:
        c = con.cursor()
        fragment_id = 1
        for authorname, authorlink in getAuthorList():
            print("Processing author", authorname)
            authorinfo = getAuthorBasicInfo(authorname, authorlink)
            c.execute("INSERT INTO Authors(name,wiki) VALUES(?, ?)", (authorname, authorinfo["wiki"]))
            for text in authorinfo["links"]:
                # Only the download is guarded: a page that answers with an
                # HTTP error is skipped, database errors still propagate.
                try:
                    title, content = getText(text)
                except urllib.error.HTTPError:
                    continue
                c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)", (fragment_id, title, authorname))
                c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)", (fragment_id, content))
                fragment_id += 1
            # Commit per author so a failure late in a long run does not
            # discard the authors that were already downloaded.
            con.commit()
    finally:
        # Always release the database handle, even when the run aborts.
        con.close()
# Start the full download. This is a module-level call, so it runs whenever
# the file is executed or imported.
getAllTexts()