Commit a7f14da7 authored by Bronger, Torsten
Browse files

Add rudimentary crawler.py

parent 779b5d78
#!/bin/python
import re, datetime
import trio, httpx
# Shared limiter: fetch_200_dois acquires it around each chunk request, so at
# most 100 of those requests are in flight at the same time.
limit = trio.CapacityLimiter(100)
# Search URL template on juser.fz-juelich.de; "{year}" is filled in with
# str.format().  The percent-encoded query parameter "p" decodes to:
#     pub:{year} and 0247_a:/^10\./
base_juser_url = "https://juser.fz-juelich.de/search?cc=VDB&p=pub%3A{year}+and+0247_a%3A%2F%5E10%5C.%2F"
# Normalized DOIs collected so far; filled by fetch_200_dois.
dois = set()
# Entries rejected by normalize_doi.
broken_dois = []
def normalize_doi(doi):
    """Return the normalized form of *doi*, or None if it is invalid.

    Normalization strips surrounding whitespace and lower-cases the string.
    A value that is empty after stripping is considered invalid; in that case
    the raw input is recorded in the module-level ``broken_dois`` list.

    :param doi: raw DOI string, e.g. one line of a search result
    :return: the normalized DOI, or None if *doi* was invalid
    """
    result = doi.strip().lower()
    if not result:
        # Record the original input rather than ``result``: the latter is
        # always the empty string here and carries no diagnostic value.
        broken_dois.append(doi)
        return None
    return result
async def fetch_200_dois(start, year):
    """Fetch one chunk of DOIs for *year* and add them to ``dois``.

    The request asks the search for 200 records (``rg=200``) beginning at
    record *start* (``jrec``) in DOI output format (``of=doi``).  Every line
    of the response body is passed through ``normalize_doi``; lines it
    rejects are dropped.

    :param start: index of the first record to fetch, sent as ``jrec``
    :param year: publication year substituted into ``base_juser_url``
    """
    url = base_juser_url.format(year=year) + f"&rg=200&jrec={start}&of=doi"
    # Hold the shared limiter only while talking to the server.
    async with limit:
        # The context manager closes the client (and its connection pool)
        # once the request is done, instead of leaking one client per call.
        async with httpx.AsyncClient() as client:
            response = await client.get(url)
            body = response.read().decode()
    normalized = (normalize_doi(line) for line in body.splitlines())
    dois.update(doi for doi in normalized if doi)
async def crawl_year(year):
    """Collect the DOIs of all publications of *year* into ``dois``.

    The total number of publications is scraped from the search result page;
    one ``fetch_200_dois`` task is then started for every chunk of 200
    records.  The function returns once all of those tasks have finished.

    :param year: publication year substituted into ``base_juser_url``
    :raises ValueError: if the publication count cannot be found in, or
        parsed from, the search result page
    """
    # Use the async client so that this request does not block the event
    # loop while the tasks for the other years are running.
    async with httpx.AsyncClient() as client:
        response = await client.get(base_juser_url.format(year=year))
        page = response.read().decode()
    match = re.search(r": <strong>([^<]+)", page)
    if match is None:
        raise ValueError(f"no publication count found on search page for year {year}")
    # The count may contain thousands separators, e.g. "1,234".
    number_publications = int(match.group(1).replace(",", ""))
    async with trio.open_nursery() as nursery:
        for start in range(1, number_publications + 1, 200):
            nursery.start_soon(fetch_200_dois, start, year)
async def crawl():
    """Crawl the current year and the two preceding years.

    One ``crawl_year`` task is started per year in a single nursery, so the
    function returns only after all three years have been processed.
    """
    this_year = datetime.date.today().year
    years = range(this_year - 2, this_year + 1)
    async with trio.open_nursery() as nursery:
        for year in years:
            nursery.start_soon(crawl_year, year)
# Run the crawl to completion, then report how many distinct DOIs were
# collected in the module-level ``dois`` set.
trio.run(crawl)
print(len(dois))
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment