109 lines
3.2 KiB
Python
109 lines
3.2 KiB
Python
import re
import sqlite3
import urllib
import urllib.error
import urllib.request

from pyquery import PyQuery
|
|
|
|
# Scheme and host of the Wikisource site being scraped; page paths and
# relative hrefs are appended to this value to build full URLs.
BASE_URL = "https://ro.wikisource.org"
|
|
# Letters iterated by getAuthorList; each one selects a category index page.
LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
|
|
|
def getAuthorList():
    """Collect the authors listed on the per-letter category index pages.

    For every letter in ``LETTERS`` the page
    ``BASE_URL + '/wiki/Categorie:Autori-' + letter`` is downloaded and the
    anchors inside ``div.mw-category-generated`` are scanned. Only anchors
    whose text starts with ``"Autor:"`` are kept.

    Returns:
        A list of unique ``(authorname, authorlink)`` tuples, where
        ``authorname`` is the anchor text without the ``"Autor:"`` prefix and
        ``authorlink`` is the anchor's ``href`` value. Entries are ordered by
        first appearance.

    Raises:
        Any error raised by ``urllib.request.urlopen`` while fetching an
        index page propagates to the caller.
    """
    prefix = "Autor:"
    # A dict drops duplicates while keeping first-seen order, so repeated
    # runs over identical pages return the authors in the same order.
    unique_authors = {}
    for letter in LETTERS:
        print("Processing link page for letter", letter)

        # Read index page; the response is closed once its body is read.
        url = BASE_URL + '/wiki/Categorie:Autori-' + letter
        with urllib.request.urlopen(url) as response:
            data = response.read()
        q = PyQuery(data)

        for item in q("div.mw-category-generated").find("a"):
            text = item.text
            href = item.attrib.get('href')
            # An anchor may expose no direct text (text is None) or carry no
            # href; neither case can describe an author page, so skip it.
            if not text or href is None:
                continue
            if text.startswith(prefix):
                unique_authors[(text[len(prefix):], href)] = None
    return list(unique_authors)
|
|
|
|
def getAuthorWikiLink(query):
    """Return the Wikipedia link found in the tables of an author page.

    Args:
        query: Selector callable for a parsed page (this file passes a
            PyQuery object). ``query(selector)`` must return an object whose
            ``find(selector)`` results can be chained and iterated.

    Returns:
        The ``href`` of the last anchor located in a table under
        ``div#mw-content-text`` whose address contains ``"ro.wikipedia.org"``,
        or ``None`` when no anchor matches.
    """
    wikilink = None
    anchors = query("div#mw-content-text").find("table").find("a")
    for link in anchors:
        # Anchors without an href attribute are ignored instead of raising
        # KeyError.
        href = link.attrib.get('href')
        if href is not None and "ro.wikipedia.org" in href:
            # No early exit: when several anchors match, the last one wins.
            wikilink = href
    return wikilink
|
|
|
|
def getAuthorLinksList(authorname, query):
    """List the hrefs on an author page that pass the link filters.

    Args:
        authorname: Name of the author. Kept for interface compatibility;
            the filtering does not use it.
        query: Selector callable for a parsed page (this file passes a
            PyQuery object).

    Returns:
        The ``href`` values, in document order, of the anchors under
        ``div#mw-content-text`` that are not rejected. An address is rejected
        when it starts with ``"#"`` or contains any of ``"http"``,
        ``"redlink"``, ``"Fi%C8%99ier:"``, ``"index.php"`` or ``"Autor:"``.
        Anchors without an ``href`` attribute are skipped.
    """
    # Substrings that disqualify an address wherever they occur in it.
    # "Fi%C8%99ier:" is the percent-encoded form of "Fișier:".
    rejected_markers = ("http", "redlink", "Fi%C8%99ier:", "index.php", "Autor:")

    links = []
    for link in query("div#mw-content-text").find("a"):
        address = link.attrib.get('href')
        if address is None:
            # Anchor without an href: nothing to follow.
            continue
        if address.startswith("#"):
            # In-page fragment link.
            continue
        if any(marker in address for marker in rejected_markers):
            continue
        links.append(address)
    return links
|
|
|
|
def getAuthorBasicInfo(authorname, authorlink):
    """Download an author page and extract its Wikipedia link and link list.

    Args:
        authorname: Name of the author; forwarded to ``getAuthorLinksList``.
        authorlink: Address of the author page, appended to ``BASE_URL``.

    Returns:
        A dict with two keys: ``"wiki"`` holds the result of
        ``getAuthorWikiLink`` and ``"links"`` holds the result of
        ``getAuthorLinksList`` for the downloaded page.

    Raises:
        Any error raised by ``urllib.request.urlopen`` propagates to the
        caller.
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(BASE_URL + authorlink) as response:
        data = response.read()
    q = PyQuery(data)

    return {
        "wiki": getAuthorWikiLink(q),
        "links": getAuthorLinksList(authorname, q),
    }
|
|
|
|
def getText(url):
    """Download a page and return its title and text content.

    Args:
        url: Address of the page, appended to ``BASE_URL``.

    Returns:
        A ``(texttitle, textcontent)`` tuple: the text of the page's ``h1``
        selection and the text of ``#mw-content-text`` after all tables
        inside it have been removed.

    Raises:
        Any error raised by ``urllib.request.urlopen`` propagates to the
        caller (``getAllTexts`` handles ``urllib.error.HTTPError``).
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(BASE_URL + url) as response:
        data = response.read()
    q = PyQuery(data)

    texttitle = q("h1").text()

    body = q("#mw-content-text")
    # Drop tables before extracting the text so their contents are excluded.
    body.find("table").remove()

    return (texttitle, body.text())
|
|
|
|
def addAuthorToDb(authorinfo):
    """Insert one row into the ``Authors`` table of ``data/texts.db``.

    The row uses the same ``name`` and ``wiki`` columns as the insert
    performed by ``getAllTexts``. The change is committed and the connection
    is closed before returning, also when the insert fails.

    Args:
        authorinfo: Mapping describing the author. ``authorinfo["name"]`` is
            required; ``authorinfo.get("wiki")`` is stored as the Wikipedia
            link and may be absent (stored as NULL).
            NOTE(review): ``getAuthorBasicInfo`` only returns ``"wiki"`` and
            ``"links"``, so callers are assumed to add ``"name"`` themselves.
            Confirm the intended shape.

    Raises:
        KeyError: if ``authorinfo`` has no ``"name"`` entry.
        sqlite3.Error: if the database rejects the statement.
    """
    con = sqlite3.connect("data/texts.db")
    try:
        con.execute(
            "INSERT INTO Authors(name,wiki) VALUES(?, ?)",
            (authorinfo["name"], authorinfo.get("wiki")),
        )
        con.commit()
    finally:
        # Release the database file even when the insert raised.
        con.close()
|
|
|
|
def getAllTexts():
    """Scrape every author and linked text into ``data/texts.db``.

    For each author returned by ``getAuthorList`` a row is inserted into
    ``Authors(name, wiki)``. Every link returned for that author is then
    downloaded with ``getText`` and stored as one row in
    ``Fragments(id, title, author)`` plus one row in
    ``FragmentsContent(id, content)``. Fragment ids are assigned sequentially
    starting at 1. Links whose download raises ``urllib.error.HTTPError`` are
    reported and skipped.

    All rows are committed together after the last author; the connection is
    closed on both success and failure.
    """
    con = sqlite3.connect("data/texts.db")
    try:
        c = con.cursor()
        fragment_id = 1

        for authorname, authorlink in getAuthorList():
            print("Processing author", authorname)
            authorinfo = getAuthorBasicInfo(authorname, authorlink)
            c.execute("INSERT INTO Authors(name,wiki) VALUES(?, ?)", (authorname, authorinfo["wiki"]))

            for text in authorinfo["links"]:
                # Only the download can raise HTTPError, so only it is guarded.
                try:
                    title, content = getText(text)
                except urllib.error.HTTPError as error:
                    print("Skipping", text, "-", error)
                    continue
                c.execute("INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)", (fragment_id, title, authorname))
                c.execute("INSERT INTO FragmentsContent(id, content) VALUES (?, ?)", (fragment_id, content))
                fragment_id += 1

        # Persist everything once all authors have been processed.
        con.commit()
    finally:
        con.close()
|
|
|
|
# Start the full scrape as soon as this module is loaded.
getAllTexts()