roauthorship/src/step0_acquire/wikisource/author_list.py

47 lines
1.3 KiB
Python

import urllib
import urllib.parse
import urllib.request
from collections import deque

from pyquery import PyQuery
# Scheme and host of the Romanian Wikisource site; page paths are appended to it.
BASE_URL = "https://ro.wikisource.org"
# Path of the root category page where the crawl starts. It is percent-encoded
# up front because the literal contains non-ASCII characters; urllib.parse.quote
# is the documented home of the function.
BASE_INDEX = urllib.parse.quote("/wiki/Categorie:Autori_în_ordine_alfabetică")
def buildAuthorList():
    """Crawl the author category pages and collect links to author pages.

    The crawl starts at BASE_INDEX and follows every link whose href contains
    "wiki/Categorie:Autori-", fetching each index page at most once. Every
    href containing "wiki/Autor:" is recorded as an author link; hrefs
    containing "index.php" are skipped entirely.

    Progress is printed to stdout (one line per index page fetched and one
    per author link encountered, duplicates included).

    Returns:
        A list of the distinct author hrefs exactly as found in the pages
        (they are not prefixed with BASE_URL), in first-seen order.

    Nothing is caught here: any exception raised while fetching or parsing a
    page propagates to the caller.
    """
    authors = []
    pending = deque([BASE_INDEX])
    # Pages already fetched; guards against cycles between category pages.
    visited = set()

    while pending:
        page = pending.popleft()
        if page in visited:
            continue
        visited.add(page)

        print("Index page: ", page)
        # The page path is appended to BASE_URL unchanged; the context manager
        # closes the HTTP response once the body has been read.
        with urllib.request.urlopen(BASE_URL + page) as response:
            document = PyQuery(response.read())

        for link in document("a"):
            href = link.attrib.get('href')
            if href is None or "index.php" in href:
                continue
            if "wiki/Autor:" in href:
                authors.append(href)
                print("Autor: ", href)
            # Deliberately not `elif`: both checks are applied to every href.
            if "wiki/Categorie:Autori-" in href:
                pending.append(href)

    # Drop duplicates while keeping first-seen order (dicts preserve
    # insertion order).
    return list(dict.fromkeys(authors))
# Module-level entry point: runs the crawl as soon as this file is executed
# (or imported) and prints the resulting list of author links.
authors = buildAuthorList()
print(authors)