Improved the scripts that acquire data from Wikisource (and now Wikidata).

This commit is contained in:
Tiberiu Chibici 2017-06-06 00:39:36 +03:00
parent 64caeab14e
commit 6102e946d7
3 changed files with 140 additions and 8 deletions

View File

@ -0,0 +1,93 @@
import urllib
from pyquery import PyQuery
import sqlite3
import re
import json
import requests
# Base addresses of the Romanian Wikisource site and its MediaWiki API.
BASE_URL = "https://ro.wikisource.org"
API_URL = BASE_URL + "/w/api.php"
# Example of the kind of request issued below:
# https://ro.wikisource.org/w/api.php?action=query&titles=&prop=links|wbentityusage
def wikisourceQuery(query):
    """Run a MediaWiki 'query' action against ro.wikisource.org.

    Follows the API's 'continue' tokens, yielding the 'query' payload dict
    of every response page until the result set is exhausted.

    query: dict of extra API parameters (e.g. {"titles": ..., "prop": "links"}).
           The caller's dict is left unmodified.

    Yields: each 'query' dict returned by the API.
    """
    # Work on a copy: the original code wrote action/format/continuation
    # keys straight into the caller's dict, a side effect that leaked
    # between calls.
    params = dict(query)
    params["action"] = "query"
    params["format"] = "json"

    cont = {}
    while True:
        params.update(cont)
        result = requests.get(API_URL, params=params).json()

        # Surface errors/warnings; the API may still include partial data.
        if 'error' in result:
            print("Error: ", result['error'])
        if 'warnings' in result:
            print("Warnings: ", result['warnings'])

        if 'query' in result:
            yield result['query']

        # Follow the continuation token, if any, to fetch the next batch.
        if 'continue' in result:
            cont = result['continue']
        else:
            break
def getAuthorWorks(authorLink):
    """Yield the titles of all pages linked from an author's page.

    authorLink: page path such as "/wiki/Autor:Nicolae_Iorga"; the
                "/wiki/" prefix is stripped to obtain the API page title.
    """
    title = authorLink.replace("/wiki/", "")
    query = {"titles": title, "prop": "links"}
    for result in wikisourceQuery(query):
        for page in result['pages'].values():
            # A page with no outgoing links omits the 'links' key entirely;
            # the original page["links"] raised KeyError in that case.
            for link in page.get("links", []):
                yield link["title"]
def getAuthorWikidata(authorLink):
    """Fetch the Wikidata entity associated with a ro.wikisource author page.

    Resolves the page through Special:ItemByTitle, then follows the
    <link rel="alternate" type="application/json"> element of the returned
    HTML to download the machine-readable entity JSON.

    Returns the entity dict, or {} when no entity could be located.
    """
    QUERY_URL = "https://www.wikidata.org/wiki/Special:ItemByTitle"
    query = {
        "site": "rowikisource",
        "page": authorLink.replace("/wiki/", ""),
    }
    result = requests.get(QUERY_URL, params=query).text

    # Scan the <link> elements for the JSON alternate representation.
    jsonLocation = ""
    q = PyQuery(result)
    for link in q("link"):
        attrib = link.attrib
        # Skip links lacking the attributes we match on
        if ('rel' not in attrib) or ('type' not in attrib):
            continue
        if attrib['rel'] == 'alternate' and attrib['type'] == 'application/json':
            jsonLocation = attrib['href']
            break  # first match wins; no need to scan the remaining links

    # Found the JSON file? get it
    if len(jsonLocation) > 0:
        # Renamed from 'json': the original shadowed the imported json module.
        data = requests.get(jsonLocation).json()
        entities = data['entities']
        entities_list = list(entities.values())
        return entities_list[0]

    # Found nothing :( -- the original printed the whole HTML body here as
    # "request status", which flooded the console; report only the author.
    print("Warning: couldn't find data for author ", authorLink)
    return {}
def getAuthorInfo(authorLink):
    """Assemble and print everything gathered about one author: a
    placeholder name, the Wikidata entity, and the list of works."""
    info = {
        "name": "todo",  # TODO: extract the real author name
        "wikidata": getAuthorWikidata(authorLink),
        "works": list(getAuthorWorks(authorLink)),
    }
    print(info)
if __name__ == "__main__":
    # Guarded so importing this module does not trigger network requests.
    getAuthorInfo("/wiki/Autor:Nicolae_Iorga")

View File

@ -0,0 +1,47 @@
import urllib
import urllib.request

from pyquery import PyQuery
# Site root and the alphabetical author category index page. quote()
# percent-encodes the non-ASCII characters so the path can be appended to
# BASE_URL in a request URL.
BASE_URL = "https://ro.wikisource.org"
BASE_INDEX = urllib.request.quote("/wiki/Categorie:Autori_în_ordine_alfabetică")
def buildAuthorList():
    """Crawl the alphabetical author category on ro.wikisource.org.

    Starts at BASE_INDEX and follows every per-letter sub-category page
    ("Categorie:Autori-...") breadth-first, collecting links to author
    pages ("wiki/Autor:...").

    Returns the deduplicated list of author page links (order unspecified,
    as in the original which returned list(set(authors))).
    """
    authors = set()
    indexPages = [BASE_INDEX]
    # A set gives O(1) membership tests; the original used a list, making
    # each already-processed check O(n).
    processedPages = set()

    while indexPages:
        currentIndexPage = indexPages.pop(0)

        # Avoid infinite loops by avoiding re-processing already processed pages
        if currentIndexPage in processedPages:
            continue
        processedPages.add(currentIndexPage)

        # Read page
        print("Index page: ", currentIndexPage)
        data = urllib.request.urlopen(BASE_URL + currentIndexPage).read()
        q = PyQuery(data)
        for link in q("a"):
            if 'href' not in link.attrib:
                continue
            linkaddr = link.attrib['href']
            # Skip edit/history/etc. links served through index.php.
            if "index.php" in linkaddr:
                continue
            if "wiki/Autor:" in linkaddr:
                authors.add(linkaddr)
                print("Autor: ", linkaddr)
            if "wiki/Categorie:Autori-" in linkaddr:
                indexPages.append(linkaddr)

    return list(authors)
if __name__ == "__main__":
    # Guarded so importing this module does not start the crawl.
    authors = buildAuthorList()
    print(authors)

View File

@ -1,8 +0,0 @@
<!DOCTYPE html>
<html>
<body>
<?php echo "Hello world!"?>
</body>
</html>