"""Scrape authors and their texts from ro.wikisource.org into SQLite.

The script walks the per-letter author category pages, visits every author
page it finds, downloads each text linked from that page and stores the
results in the SQLite database ``data/texts.db``.

The database is expected to already contain the tables written to below:
``Authors(name, wiki)``, ``Fragments(id, title, author)`` and
``FragmentsContent(id, content)``.  No table is created by this script.
"""

import re
import sqlite3
# ``import urllib`` alone does not load the submodules used below, so they
# are imported explicitly (this also binds the name ``urllib``).
import urllib.error
import urllib.request

from pyquery import PyQuery

BASE_URL = "https://ro.wikisource.org"
LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Prefix carried by the link text of author entries on the category pages.
_AUTHOR_PREFIX = "Autor:"

# A link is not treated as a text of the author when its href contains any
# of these substrings.  "Fi%C8%99ier:" is the percent-encoded form of
# "Fișier:".
_EXCLUDED_LINK_PARTS = ("http", "redlink", "Fi%C8%99ier:", "index.php", "Autor:")


def _fetchPage(path):
    """Download ``BASE_URL + path`` and return the raw response body (bytes).

    The HTTP response is closed before returning.  Errors raised by
    ``urllib.request.urlopen`` (e.g. ``urllib.error.HTTPError``) propagate
    to the caller.
    """
    with urllib.request.urlopen(BASE_URL + path) as response:
        return response.read()


def _isTextLink(address):
    """Return True when *address* passes the author-page link filter.

    Rejected are in-page anchors (href starting with ``#``) and any href
    containing one of ``_EXCLUDED_LINK_PARTS``.
    """
    if address.startswith("#"):
        return False
    return not any(part in address for part in _EXCLUDED_LINK_PARTS)


def getAuthorList():
    """Collect all authors listed on the per-letter category pages.

    For every letter in ``LETTERS`` the page
    ``/wiki/Categorie:Autori-<letter>`` is downloaded and every anchor inside
    ``div.mw-category-generated`` whose text starts with ``"Autor:"`` is
    recorded.

    Returns:
        A sorted list of unique ``(authorname, authorlink)`` tuples, where
        ``authorname`` is the link text without the ``"Autor:"`` prefix and
        ``authorlink`` is the anchor's ``href`` value.
    """
    authors = set()
    for letter in LETTERS:
        print("Processing link page for letter", letter)
        # Read index page
        q = PyQuery(_fetchPage("/wiki/Categorie:Autori-" + letter))
        for item in q("div.mw-category-generated").find("a"):
            text = item.text
            href = item.attrib.get("href")
            # Anchors without direct text or without an href cannot be
            # author entries; skip them instead of raising.
            if not text or href is None:
                continue
            if text.startswith(_AUTHOR_PREFIX):
                authors.add((text[len(_AUTHOR_PREFIX):], href))
    # Sorted so that the processing order (and therefore the fragment ids
    # assigned in getAllTexts) is the same on every run.
    return sorted(authors)


def getAuthorWikiLink(query):
    """Return the ro.wikipedia.org link found on an author page, if any.

    Only anchors located inside a ``table`` within ``div#mw-content-text``
    are considered.  When several anchors match, the last one encountered
    wins.

    Args:
        query: PyQuery object of the parsed author page.

    Returns:
        The matching ``href`` string, or ``None`` when there is no match.
    """
    wikilink = None
    for link in query("div#mw-content-text").find("table").find("a"):
        href = link.attrib.get("href", "")
        if "ro.wikipedia.org" in href:
            wikilink = href
    return wikilink


def getAuthorLinksList(authorname, query):
    """Return the hrefs on an author page that pass the text-link filter.

    Args:
        authorname: Unused; kept so existing callers keep working.
        query: PyQuery object of the parsed author page.

    Returns:
        A list of ``href`` strings, in the order the anchors are found
        inside ``div#mw-content-text``.  Duplicates are not removed.
    """
    links = []
    for link in query("div#mw-content-text").find("a"):
        address = link.attrib.get("href")
        # Anchors without an href have nothing to follow.
        if address is not None and _isTextLink(address):
            links.append(address)
    return links


def getAuthorBasicInfo(authorname, authorlink):
    """Download an author page and extract its Wikipedia link and text links.

    Args:
        authorname: Name of the author (forwarded to getAuthorLinksList).
        authorlink: Site-relative link of the author page.

    Returns:
        A dict with the keys ``"wiki"`` (result of getAuthorWikiLink) and
        ``"links"`` (result of getAuthorLinksList).
    """
    q = PyQuery(_fetchPage(authorlink))
    return {
        "wiki": getAuthorWikiLink(q),
        "links": getAuthorLinksList(authorname, q),
    }


def getText(url):
    """Download a text page and return its title and content.

    The title is the text of the page's ``h1`` element(s).  The content is
    the text of ``#mw-content-text`` after all ``table`` elements inside it
    have been removed.

    Args:
        url: Site-relative link of the text page.

    Returns:
        A ``(texttitle, textcontent)`` tuple of strings.
    """
    q = PyQuery(_fetchPage(url))
    texttitle = q("h1").text()
    body = q("#mw-content-text")
    body.find("table").remove()
    return (texttitle, body.text())


def addAuthorToDb(authorinfo):
    """Unfinished helper intended to insert an author into the database.

    NOTE(review): the SQL statement below is incomplete (no column list and
    no VALUES clause), so SQLite is expected to reject it.  Nothing in this
    file calls this function and nothing is committed.  The intended columns
    cannot be determined from this file, so the statement is left untouched;
    only the connection handling was fixed so the connection is always
    closed.
    """
    con = sqlite3.connect("data/texts.db")
    try:
        c = con.cursor()
        c.execute("INSERT INTO Authors")
    finally:
        con.close()


def getAllTexts():
    """Scrape every author and every linked text into ``data/texts.db``.

    For each author returned by getAuthorList a row is inserted into
    ``Authors``; for each text link of that author one row is inserted into
    ``Fragments`` and one into ``FragmentsContent``, sharing a sequential id
    that starts at 1.  Texts whose download fails with
    ``urllib.error.HTTPError`` are skipped.

    The ``Authors`` table needs a ``wiki`` column; it was once added with
    ``ALTER TABLE Authors ADD COLUMN wiki TEXT``.

    Work is committed after each author, so an interrupted run keeps all
    authors that were fully processed.  The connection is always closed.
    """
    con = sqlite3.connect("data/texts.db")
    try:
        c = con.cursor()
        fragment_id = 1
        for authorname, authorlink in getAuthorList():
            print("Processing author", authorname)
            authorinfo = getAuthorBasicInfo(authorname, authorlink)
            c.execute(
                "INSERT INTO Authors(name,wiki) VALUES(?, ?)",
                (authorname, authorinfo["wiki"]),
            )
            # NOTE: a getAuthorWikiInfo(authorinfo) step used to be sketched
            # here; no such function exists in this file.
            for text in authorinfo["links"]:
                try:
                    title, content = getText(text)
                except urllib.error.HTTPError:
                    # Page could not be fetched; skip it without using up
                    # an id.
                    continue
                c.execute(
                    "INSERT INTO Fragments(id, title, author) VALUES (?, ?, ?)",
                    (fragment_id, title, authorname),
                )
                c.execute(
                    "INSERT INTO FragmentsContent(id, content) VALUES (?, ?)",
                    (fragment_id, content),
                )
                fragment_id += 1
            con.commit()
    finally:
        con.close()


if __name__ == "__main__":
    getAllTexts()