Improved the scripts which acquire data from Wikisource (and now Wikidata).
This commit is contained in:
parent 64caeab14e
commit 6102e946d7
93 src/step0_acquire/wikisource/author_info.py Normal file
@ -0,0 +1,93 @@
import urllib
from pyquery import PyQuery
import sqlite3
import re
import json
import requests

BASE_URL = "https://ro.wikisource.org"
API_URL = BASE_URL + "/w/api.php"
# Example API call:
# https://ro.wikisource.org/w/api.php?action=query&titles=&prop=links|wbentityusage


def wikisourceQuery(query):
    query["action"] = "query"
    query["format"] = "json"

    cont = {}

    # Download result pages until the API stops returning a continuation token
    while True:
        query.update(cont)
        result = requests.get(API_URL, params=query).json()

        # Handle errors and warnings
        if 'error' in result:
            print("Error: ", result['error'])

        if 'warnings' in result:
            print("Warnings: ", result['warnings'])

        # Yield this batch of query results
        if 'query' in result:
            yield result['query']

        # Handle 'continue': merge the continuation parameters into the next request
        if 'continue' in result:
            cont = result['continue']
        else:
            break
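
# For illustration (not part of the original commit): wikisourceQuery is a
# generator that hides the MediaWiki 'continue' protocol, so a caller can
# simply iterate over result batches, e.g.:
#
#     for batch in wikisourceQuery({"titles": "Autor:Nicolae_Iorga", "prop": "links"}):
#         print(list(batch["pages"].keys()))
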

def getAuthorWorks(authorLink):
    authorLink = authorLink.replace("/wiki/", "")

    query = {}
    query["titles"] = authorLink
    query["prop"] = "links"

    for result in wikisourceQuery(query):
        for page in result['pages'].values():
            # Pages without outgoing links have no "links" key
            links = page.get("links", [])
            for link in links:
                yield link["title"]
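
# For illustration (an assumption, not in the original commit): prop=links
# returns every outgoing wiki link on the author page, navigation and category
# links included, so a caller may want to keep only main-namespace works,
# whose titles carry no ":" namespace prefix:
#
#     works = [t for t in getAuthorWorks("/wiki/Autor:Nicolae_Iorga") if ":" not in t]
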

def getAuthorWikidata(authorLink):

    jsonLocation = ""

    # Perform a query to find the matching Wikidata item page
    QUERY_URL = "https://www.wikidata.org/wiki/Special:ItemByTitle"
    query = {}
    query["site"] = "rowikisource"
    query["page"] = authorLink.replace("/wiki/", "")
    result = requests.get(QUERY_URL, params=query).text
    q = PyQuery(result)
    for link in q("link"):
        # Skip links that lack the attributes we need
        if ('rel' not in link.attrib) or ('type' not in link.attrib):
            continue
        # The <link rel="alternate" type="application/json"> element points to the entity JSON
        if link.attrib['rel'] == 'alternate' and link.attrib['type'] == 'application/json':
            jsonLocation = link.attrib['href']

    # Found the JSON location? Fetch it and unwrap the single entity
    if len(jsonLocation) > 0:
        data = requests.get(jsonLocation).json()
        entities = data['entities']
        entities_list = list(entities.values())
        return entities_list[0]

    # Found nothing :(
    print("Warning: couldn't find Wikidata data for author ", authorLink)
    return {}
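
# For illustration only (an alternative, not what this commit does): the same
# lookup can be made without scraping HTML by calling the Wikidata API's
# wbgetentities action directly, e.g.:
#
#     params = {"action": "wbgetentities", "sites": "rowikisource",
#               "titles": "Autor:Nicolae_Iorga", "format": "json"}
#     entity = requests.get("https://www.wikidata.org/w/api.php", params=params).json()
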

def getAuthorInfo(authorLink):
    author = {}
    author["name"] = "todo"
    author["wikidata"] = getAuthorWikidata(authorLink)
    author["works"] = list(getAuthorWorks(authorLink))
    print(author)


if __name__ == "__main__":
    getAuthorInfo("/wiki/Autor:Nicolae_Iorga")
47 src/step0_acquire/wikisource/author_list.py Normal file
@ -0,0 +1,47 @@
import urllib.parse
import urllib.request
from pyquery import PyQuery

BASE_URL = "https://ro.wikisource.org"
BASE_INDEX = urllib.parse.quote("/wiki/Categorie:Autori_în_ordine_alfabetică")
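
# For illustration (not part of the original commit): quote() percent-encodes
# the non-ASCII characters in the category title so the path can be passed
# safely to urlopen(), e.g.
#
#     >>> urllib.parse.quote("/wiki/Categorie:Autori_în_ordine_alfabetică")
#     '/wiki/Categorie%3AAutori_%C3%AEn_ordine_alfabetic%C4%83'
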

def buildAuthorList():
    authors = []
    indexPages = [ BASE_INDEX ]
    processedPages = []

    while len(indexPages) > 0:

        currentIndexPage = indexPages.pop(0)

        # Avoid infinite loops by skipping pages that were already processed
        if currentIndexPage in processedPages:
            continue

        processedPages.append(currentIndexPage)

        # Read page
        print("Index page: ", currentIndexPage)
        # currentIndexPage = urllib.parse.quote(currentIndexPage)
        data = urllib.request.urlopen(BASE_URL + currentIndexPage).read()
        q = PyQuery(data)

        for link in q("a"):
            if 'href' in link.attrib:
                linkaddr = link.attrib['href']

                # Skip edit/history and other index.php links
                if "index.php" in linkaddr:
                    continue

                # Author pages go into the result list
                if "wiki/Autor:" in linkaddr:
                    authors.append(linkaddr)
                    print("Author: ", linkaddr)

                # Per-letter category pages are queued for crawling
                if "wiki/Categorie:Autori-" in linkaddr:
                    indexPages.append(linkaddr)

    # Remove duplicates
    authors = list(set(authors))
    return authors
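
# For illustration (not part of the original commit): each entry in the
# returned list is a page path such as "/wiki/Autor:Nicolae_Iorga", which is
# the format author_info.getAuthorInfo() expects as input.
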

if __name__ == "__main__":
    authors = buildAuthorList()
    print(authors)
@ -1,8 +0,0 @@
<!DOCTYPE html>
<html>
<body>

<?php echo "Hello world!"?>

</body>
</html>