"""Helpers for scraping author data from the Romanian Wikisource.

This module is the readable form of a patch that added
``src/step0_acquire/wikisource/author_info.py`` and
``src/step0_acquire/wikisource/author_list.py`` (the same patch also
deleted ``src/tools/test.php``).

* ``wikisourceQuery`` / ``getAuthorWorks`` / ``getAuthorWikidata`` /
  ``getAuthorInfo`` come from ``author_info.py``.
* ``buildAuthorList`` comes from ``author_list.py``.

The third-party packages ``requests`` and ``pyquery`` are imported inside
the functions that need them so that importing this module never fails
(and never touches the network) when they are not installed.
"""
import collections
import json
import re
import sqlite3
import urllib
import urllib.parse
import urllib.request

BASE_URL = "https://ro.wikisource.org"
API_URL = BASE_URL + "/w/api.php"
# https://ro.wikisource.org/w/api.php?action=query&titles=&prop=links|wbentityusage
WIKIDATA_ITEM_BY_TITLE_URL = "https://www.wikidata.org/wiki/Special:ItemByTitle"
BASE_INDEX = urllib.parse.quote("/wiki/Categorie:Autori_în_ordine_alfabetică")

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 30


def _pageTitle(authorLink):
    """Return the page title of *authorLink* by dropping the "/wiki/" part."""
    return authorLink.replace("/wiki/", "")


def wikisourceQuery(query, session=None):
    """Run a MediaWiki ``action=query`` request and follow continuations.

    Args:
        query: Mapping of extra API parameters (e.g. ``titles``, ``prop``).
            It is copied, so the caller's mapping is left untouched.
        session: Optional object exposing ``get(url, params=..., timeout=...)``
            whose result has a ``json()`` method (for example a
            ``requests.Session``).  Defaults to the ``requests`` module.

    Yields:
        The ``"query"`` member of every response that has one.
    """
    if session is None:
        import requests  # lazy: keeps the module importable without requests

        session = requests

    params = dict(query)
    params["action"] = "query"
    params["format"] = "json"

    while True:
        result = session.get(
            API_URL, params=params, timeout=REQUEST_TIMEOUT
        ).json()

        # Errors and warnings are reported but do not abort the iteration.
        if "error" in result:
            print("Error: ", result["error"])
        if "warnings" in result:
            print("Warnings: ", result["warnings"])

        if "query" in result:
            yield result["query"]

        # The API signals "more data" by returning a 'continue' mapping whose
        # entries must be merged into the next request.
        if "continue" not in result:
            break
        params.update(result["continue"])


def getAuthorWorks(authorLink, session=None):
    """Yield the titles of all pages linked from an author page.

    Args:
        authorLink: Author link such as ``"/wiki/Autor:Nicolae_Iorga"``.
        session: Optional HTTP session, forwarded to ``wikisourceQuery``.

    Yields:
        The ``"title"`` of each entry in every page's ``"links"`` list.
        Pages without a ``"links"`` member contribute nothing.
    """
    query = {"titles": _pageTitle(authorLink), "prop": "links"}

    for result in wikisourceQuery(query, session=session):
        for page in result.get("pages", {}).values():
            for link in page.get("links", ()):
                yield link["title"]


def getAuthorWikidata(authorLink, session=None):
    """Fetch the Wikidata entity that corresponds to an author page.

    The Wikidata ``Special:ItemByTitle`` page is requested for the author,
    and its ``<link rel="alternate" type="application/json">`` element is
    used to locate the entity's JSON document.

    Args:
        authorLink: Author link such as ``"/wiki/Autor:Nicolae_Iorga"``.
        session: Optional object exposing ``get(url, params=..., timeout=...)``
            (for example a ``requests.Session``).  Defaults to ``requests``.

    Returns:
        The first value of the JSON document's ``"entities"`` mapping, or an
        empty dict (after printing a warning) when nothing was found.
    """
    if session is None:
        import requests  # lazy: keeps the module importable without requests

        session = requests
    from pyquery import PyQuery  # lazy: only needed to parse the HTML page

    params = {"site": "rowikisource", "page": _pageTitle(authorLink)}
    response = session.get(
        WIKIDATA_ITEM_BY_TITLE_URL, params=params, timeout=REQUEST_TIMEOUT
    )

    jsonLocation = ""
    for link in PyQuery(response.text)("link"):
        attrs = link.attrib
        is_json_alternate = (
            attrs.get("rel") == "alternate"
            and attrs.get("type") == "application/json"
        )
        href = attrs.get("href")
        if is_json_alternate and href:
            # No break: like the original, the last matching link wins.
            jsonLocation = href

    if jsonLocation:
        document = session.get(jsonLocation, timeout=REQUEST_TIMEOUT).json()
        for entity in document["entities"].values():
            return entity

    # Found nothing :(
    print(
        "Warning: couldn't find data for author ",
        authorLink,
        ", request status ",
        response.status_code,
    )
    return {}


def getAuthorInfo(authorLink):
    """Collect, print and return the known information about an author.

    Args:
        authorLink: Author link such as ``"/wiki/Autor:Nicolae_Iorga"``.

    Returns:
        A dict with the keys ``"name"`` (placeholder ``"todo"``),
        ``"wikidata"`` (see ``getAuthorWikidata``) and ``"works"`` (list of
        titles from ``getAuthorWorks``).
    """
    author = {
        "name": "todo",
        "wikidata": getAuthorWikidata(authorLink),
        "works": list(getAuthorWorks(authorLink)),
    }
    print(author)
    return author


def buildAuthorList():
    """Crawl the alphabetical author index and return all author links.

    Starting at ``BASE_INDEX``, every index page is downloaded once; links
    containing ``"wiki/Autor:"`` are collected and links containing
    ``"wiki/Categorie:Autori-"`` are queued as further index pages.  Links
    containing ``"index.php"`` are ignored.

    Returns:
        The distinct author links, in the order they were first seen.
    """
    from pyquery import PyQuery  # lazy: only needed to parse the HTML pages

    authors = {}  # dict used as an insertion-ordered set
    pendingPages = collections.deque([BASE_INDEX])
    processedPages = set()

    while pendingPages:
        currentIndexPage = pendingPages.popleft()

        # Avoid infinite loops by never re-processing a page.
        if currentIndexPage in processedPages:
            continue
        processedPages.add(currentIndexPage)

        print("Index page: ", currentIndexPage)
        with urllib.request.urlopen(
            BASE_URL + currentIndexPage, timeout=REQUEST_TIMEOUT
        ) as response:
            data = response.read()

        for link in PyQuery(data)("a"):
            linkaddr = link.attrib.get("href")
            if linkaddr is None or "index.php" in linkaddr:
                continue

            if "wiki/Autor:" in linkaddr:
                authors[linkaddr] = None
                print("Autor: ", linkaddr)

            if "wiki/Categorie:Autori-" in linkaddr:
                pendingPages.append(linkaddr)

    return list(authors)


if __name__ == "__main__":
    # Script behaviour of the two original files; guarded so that importing
    # the module does not start network requests.
    getAuthorInfo("/wiki/Autor:Nicolae_Iorga")
    authors = buildAuthorList()
    print(authors)