Improved the scripts that acquire data from Wikisource (and now also from Wikidata).
This commit is contained in:
parent
64caeab14e
commit
6102e946d7
93
src/step0_acquire/wikisource/author_info.py
Normal file
93
src/step0_acquire/wikisource/author_info.py
Normal file
@ -0,0 +1,93 @@
|
||||
import urllib
|
||||
from pyquery import PyQuery
|
||||
import sqlite3
|
||||
import re
|
||||
import json
|
||||
import requests
|
||||
|
||||
# Root of the Romanian Wikisource site.
BASE_URL = "https://ro.wikisource.org"
# MediaWiki API endpoint of that site.
API_URL = "/".join((BASE_URL, "w", "api.php"))
# Example request:
# https://ro.wikisource.org/w/api.php?action=query&titles=&prop=links|wbentityusage
|
||||
|
||||
def wikisourceQuery(query):
    """Run a MediaWiki ``action=query`` request and yield each result batch.

    The request is sent to ``API_URL`` and repeated with the continuation
    values returned by the server until a response carries no ``continue``
    member.

    Args:
        query: Mapping of additional API parameters (for example ``titles``
            and ``prop``). It is copied, so the caller's mapping is not
            modified.

    Yields:
        The ``query`` member of every JSON response that contains one.
    """
    # Work on a copy so "action", "format" and continuation tokens never leak
    # into the caller's dictionary.
    base_params = dict(query)
    base_params["action"] = "query"
    base_params["format"] = "json"

    cont = {}

    while True:
        # Rebuild the parameters for every request: only the most recent
        # continuation values are sent, stale ones are dropped.
        params = dict(base_params)
        params.update(cont)
        # A timeout keeps an unresponsive server from blocking forever.
        result = requests.get(API_URL, params=params, timeout=30).json()

        # Report errors and warnings returned by the API.
        if 'error' in result:
            print("Error: ", result['error'])

        if 'warnings' in result:
            print("Warnings: ", result['warnings'])

        # Hand the payload of this batch to the caller.
        if 'query' in result:
            yield result['query']

        # Stop when the server reports nothing left to fetch.
        if 'continue' not in result:
            break
        cont = result['continue']
|
||||
|
||||
|
||||
def getAuthorWorks(authorLink):
    """Yield the titles of all pages linked from an author's page.

    Args:
        authorLink: Author page path such as ``/wiki/Autor:Nicolae_Iorga``.
            A bare page title is accepted too, and percent-encoded
            characters are decoded before the title is sent to the API.

    Yields:
        The ``title`` of every entry in the ``links`` list of each page
        returned by ``wikisourceQuery`` for ``prop=links``.
    """
    from urllib.parse import unquote

    # Strip only a leading "/wiki/" and decode %XX escapes: the request
    # library encodes parameters itself, so an already-encoded title would be
    # encoded twice.
    title = authorLink
    if title.startswith("/wiki/"):
        title = title[len("/wiki/"):]
    title = unquote(title)

    query = {"titles": title, "prop": "links"}

    for result in wikisourceQuery(query):
        for page in result.get("pages", {}).values():
            # Pages without a "links" entry simply contribute nothing.
            for link in page.get("links", []):
                yield link["title"]
|
||||
|
||||
|
||||
def getAuthorWikidata(authorLink):
    """Fetch the Wikidata entity associated with an author page.

    The function requests ``Special:ItemByTitle`` on wikidata.org with
    ``site=rowikisource`` and the page title, scans the returned HTML for a
    ``<link rel="alternate" type="application/json">`` element, downloads the
    JSON document it points to and returns the first value of that
    document's ``entities`` mapping.

    Args:
        authorLink: Author page path such as ``/wiki/Autor:Nicolae_Iorga``.
            A bare page title is accepted too, and percent-encoded
            characters are decoded before the lookup.

    Returns:
        The entity dictionary, or an empty dict (after printing a warning)
        when no JSON link or no entity was found.
    """
    from urllib.parse import unquote

    QUERY_URL = "https://www.wikidata.org/wiki/Special:ItemByTitle"

    # Strip only a leading "/wiki/" and decode %XX escapes so the request
    # library does not encode the title a second time.
    title = authorLink
    if title.startswith("/wiki/"):
        title = title[len("/wiki/"):]
    title = unquote(title)

    # Perform a query to find the correct page.
    query = {"site": "rowikisource", "page": title}
    response = requests.get(QUERY_URL, params=query, timeout=30)

    jsonLocation = ""
    for link in PyQuery(response.text)("link"):
        attrs = link.attrib
        # Links lacking either attribute can never match and are skipped.
        if attrs.get('rel') == 'alternate' and attrs.get('type') == 'application/json':
            jsonLocation = attrs.get('href', "")

    # Found the JSON file? Download it and return its first entity.
    if jsonLocation:
        # Named "payload" so the imported json module is not shadowed.
        payload = requests.get(jsonLocation, timeout=30).json()
        for entity in payload.get('entities', {}).values():
            return entity

    # Found nothing: report the HTTP status code rather than the page body.
    print("Warning: couldn't find data for author ", authorLink,
          ", request status ", response.status_code)
    return {}
|
||||
|
||||
|
||||
def getAuthorInfo(authorLink):
    """Collect the available information about one author.

    Args:
        authorLink: Author page path such as ``/wiki/Autor:Nicolae_Iorga``.

    Returns:
        A dict with the keys ``name`` (currently the placeholder string
        ``"todo"``), ``wikidata`` (result of ``getAuthorWikidata``) and
        ``works`` (list of the titles produced by ``getAuthorWorks``).
        The dict is also printed.
    """
    author = {
        "name": "todo",  # placeholder value; no name lookup is done here
        "wikidata": getAuthorWikidata(authorLink),
        "works": list(getAuthorWorks(authorLink)),
    }
    print(author)
    # Return the record as well so callers can use it, not only read it
    # from standard output.
    return author
|
||||
|
||||
# Top-level call: executed whenever this module is run or imported.
# NOTE(review): looks like a manual smoke test for a single author; consider
# moving it under an `if __name__ == "__main__":` guard.
getAuthorInfo("/wiki/Autor:Nicolae_Iorga")
|
47
src/step0_acquire/wikisource/author_list.py
Normal file
47
src/step0_acquire/wikisource/author_list.py
Normal file
@ -0,0 +1,47 @@
|
||||
import urllib
|
||||
from pyquery import PyQuery
|
||||
|
||||
# Imported explicitly: a bare `import urllib` does not load the submodules,
# and quote() is documented as part of urllib.parse.
import urllib.parse

# Root of the Romanian Wikisource site.
BASE_URL = "https://ro.wikisource.org"
# Percent-encoded path of the category page used as the crawl's entry point.
BASE_INDEX = urllib.parse.quote("/wiki/Categorie:Autori_în_ordine_alfabetică")
|
||||
|
||||
def buildAuthorList():
    """Crawl the author category pages and return the author links found.

    Starting from ``BASE_INDEX``, every index page is downloaded once and its
    ``<a>`` elements are inspected: hrefs containing ``wiki/Autor:`` are
    collected as authors, hrefs containing ``wiki/Categorie:Autori-`` are
    queued as further index pages, and hrefs containing ``index.php`` are
    ignored. Progress is printed while crawling.

    Returns:
        A list of unique author hrefs in the order they were first seen.
    """
    import urllib.request
    from collections import deque

    authors = []
    indexPages = deque([BASE_INDEX])  # FIFO queue of pages still to visit
    processedPages = set()            # constant-time "already seen" check

    while indexPages:
        currentIndexPage = indexPages.popleft()

        # Avoid infinite loops by never processing the same page twice.
        if currentIndexPage in processedPages:
            continue
        processedPages.add(currentIndexPage)

        # Read the page; the context manager closes the connection.
        print("Index page: ", currentIndexPage)
        with urllib.request.urlopen(BASE_URL + currentIndexPage) as response:
            data = response.read()

        for link in PyQuery(data)("a"):
            linkaddr = link.attrib.get('href')
            if linkaddr is None or "index.php" in linkaddr:
                continue

            if "wiki/Autor:" in linkaddr:
                authors.append(linkaddr)
                print("Autor: ", linkaddr)

            if "wiki/Categorie:Autori-" in linkaddr:
                indexPages.append(linkaddr)

    # Remove duplicates while keeping a deterministic, first-seen order.
    return list(dict.fromkeys(authors))
|
||||
|
||||
# Top-level call: the crawl starts whenever this module is run or imported.
authors = buildAuthorList()
|
||||
# Print the list returned by buildAuthorList().
print(authors)
|
@ -1,8 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
|
||||
<?php echo "Hello world!"?>
|
||||
|
||||
</body>
|
||||
</html>
|
Loading…
x
Reference in New Issue
Block a user