93 lines
2.5 KiB
Python
93 lines
2.5 KiB
Python
import urllib
|
|
from pyquery import PyQuery
|
|
import sqlite3
|
|
import re
|
|
import json
|
|
import requests
|
|
|
|
# Root of the Romanian Wikisource site.
BASE_URL = "https://ro.wikisource.org"

# MediaWiki Action API endpoint of that site, built from BASE_URL.
API_URL = f"{BASE_URL}/w/api.php"

# Example request against the endpoint:
# https://ro.wikisource.org/w/api.php?action=query&titles=&prop=links|wbentityusage
|
|
|
|
def wikisourceQuery(query):
    """Run an ``action=query`` request against API_URL, following continuation.

    This is a generator: it issues one GET request per batch and yields the
    ``"query"`` member of each JSON response that has one, until a response
    arrives without a ``"continue"`` member.

    Args:
        query: dict of additional API parameters (for example ``titles`` and
            ``prop``). It is copied; the caller's dict is never modified.

    Yields:
        The value of ``result["query"]`` for each batch.

    API-level ``error`` and ``warnings`` members are only printed. Exceptions
    raised by ``requests`` (connection problems, invalid JSON) propagate.
    """
    # Work on a copy so the caller's dict does not silently pick up
    # "action", "format" and continuation keys.
    base_params = dict(query)
    base_params["action"] = "query"
    base_params["format"] = "json"

    cont = {}
    while True:
        # Rebuild the parameters from the base on every round so that
        # continuation keys from an earlier batch cannot leak into later ones.
        params = dict(base_params)
        params.update(cont)
        result = requests.get(API_URL, params=params).json()

        # Report errors and warnings, but keep going: the response may still
        # carry usable data or a continuation.
        if 'error' in result:
            print("Error: ", result['error'])
        if 'warnings' in result:
            print("Warnings: ", result['warnings'])

        if 'query' in result:
            yield result['query']

        # No "continue" member means this was the last batch.
        if 'continue' not in result:
            break
        cont = result['continue']
|
|
|
|
|
|
def getAuthorWorks(authorLink):
    """Yield the title of every page linked from an author page.

    Args:
        authorLink: link to the author page, e.g. ``"/wiki/Autor:Some_Name"``;
            the ``"/wiki/"`` part is stripped to obtain the page title.

    Yields:
        The ``"title"`` of each entry in the ``links`` list returned by
        ``wikisourceQuery`` with ``prop=links``, across all result batches.
    """
    query = {
        "titles": authorLink.replace("/wiki/", ""),
        "prop": "links",
    }

    for result in wikisourceQuery(query):
        for page in result.get("pages", {}).values():
            # A page entry can come back without a "links" member (e.g. the
            # title does not exist or the page links nowhere); treat that as
            # "no links" instead of raising KeyError.
            for link in page.get("links", []):
                yield link["title"]
|
|
|
|
|
|
def getAuthorWikidata(authorLink):
    """Fetch the Wikidata entity associated with an author page.

    Requests Wikidata's ``Special:ItemByTitle`` for site ``rowikisource`` and
    the page named by ``authorLink``, scans the returned HTML for a
    ``<link rel="alternate" type="application/json">`` element, downloads the
    JSON document it points to and returns the first value of its
    ``"entities"`` mapping.

    Args:
        authorLink: link to the author page, e.g. ``"/wiki/Autor:Some_Name"``;
            the ``"/wiki/"`` part is stripped to obtain the page title.

    Returns:
        The entity dict, or ``{}`` (after printing a warning) when no JSON
        link or no entity could be found.

    Exceptions raised by ``requests`` propagate to the caller.
    """
    # Perform a query to find the correct page
    QUERY_URL = "https://www.wikidata.org/wiki/Special:ItemByTitle"
    query = {
        "site": "rowikisource",
        "page": authorLink.replace("/wiki/", ""),
    }
    response = requests.get(QUERY_URL, params=query)

    jsonLocation = ""
    for link in PyQuery(response.text)("link"):
        attrib = link.attrib
        # Only <link rel="alternate" type="application/json"> is of interest;
        # links lacking either attribute simply fail the comparison.
        if attrib.get('rel') == 'alternate' and attrib.get('type') == 'application/json':
            href = attrib.get('href')
            if href:
                # No break: if several links match, the last one is used.
                jsonLocation = href

    # Found the JSON file? get it
    if jsonLocation:
        # Named "data" so the imported json module is not shadowed.
        data = requests.get(jsonLocation).json()
        # Return the first entity; an absent or empty "entities" mapping
        # falls through to the warning below instead of raising.
        for entity in data.get('entities', {}).values():
            return entity

    # Found nothing :(  Report the HTTP status of the lookup request rather
    # than dumping the whole HTML body.
    print("Warning: couldn't find data for author ", authorLink, ", request status ", response.status_code)
    return {}
|
|
|
|
|
|
def getAuthorInfo(authorLink):
    """Collect, print and return the available information about an author.

    Args:
        authorLink: link to the author page, e.g. ``"/wiki/Autor:Some_Name"``.

    Returns:
        A dict with the keys:
          * ``"name"``     - currently the placeholder string ``"todo"``;
          * ``"wikidata"`` - the result of ``getAuthorWikidata(authorLink)``;
          * ``"works"``    - list of titles from ``getAuthorWorks(authorLink)``.
    """
    author = {
        "name": "todo",  # placeholder: the real name is not extracted yet
        "wikidata": getAuthorWikidata(authorLink),
        # getAuthorWorks is a generator; materialize it into a list.
        "works": list(getAuthorWorks(authorLink)),
    }
    print(author)
    # Hand the record back to the caller as well as printing it.
    return author
|
|
|
|
# Demo run: fetch and print the data for one author. Guarded so the network
# requests happen only when the file is executed as a script, not on import.
if __name__ == "__main__":
    getAuthorInfo("/wiki/Autor:Nicolae_Iorga")