# Listing metadata: 47 lines, 1.3 KiB, Python
import urllib
import urllib.parse
import urllib.request
from collections import deque

from pyquery import PyQuery

# Scheme and host of the Romanian Wikisource site; page paths are appended to it.
BASE_URL = "https://ro.wikisource.org"

# Percent-encoded path of the alphabetical author category page.
# quote() keeps "/" as-is but escapes ":" and the non-ASCII letters (as UTF-8).
BASE_INDEX = urllib.parse.quote("/wiki/Categorie:Autori_în_ordine_alfabetică")

def buildAuthorList():
    """Crawl the author category index pages and collect author links.

    Starting from ``BASE_INDEX``, each index page is downloaded from
    ``BASE_URL`` and every ``<a>`` element carrying an ``href`` is inspected:

    * hrefs containing ``index.php`` are skipped;
    * hrefs containing ``wiki/Autor:`` are recorded as author links;
    * hrefs containing ``wiki/Categorie:Autori-`` are queued as further
      index pages to visit.

    Progress is printed to stdout for every index page and every author
    link encountered (including repeats).

    Returns:
        list: The author hrefs exactly as found in the pages, de-duplicated,
        in order of first discovery.

    Raises:
        Nothing is caught here: any error raised by ``urlopen`` or by the
        HTML parser propagates to the caller.
    """
    authors = []
    # FIFO queue of index page paths still to be fetched.
    pending = deque([BASE_INDEX])
    # Paths already fetched; category pages can link to one another, so this
    # is what keeps the crawl from looping forever.
    visited = set()

    while pending:
        page = pending.popleft()
        if page in visited:
            continue
        visited.add(page)

        print("Index page: ", page)
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(BASE_URL + page) as response:
            document = PyQuery(response.read())

        for anchor in document("a"):
            href = anchor.attrib.get("href")
            if href is None or "index.php" in href:
                continue

            if "wiki/Autor:" in href:
                authors.append(href)
                print("Autor: ", href)

            # Deliberately a second independent check (not ``elif``), matching
            # the original matching rules.
            if "wiki/Categorie:Autori-" in href:
                pending.append(href)

    # Remove duplicates while keeping a deterministic (first-seen) order.
    return list(dict.fromkeys(authors))

# Run the crawl when the module is executed and print the collected author links.
authors = buildAuthorList()
print(authors)