Commit a7f14da7 authored by Bronger, Torsten
Browse files

Add rudimentary

parent 779b5d78
import re, datetime
import trio, httpx
# Query URL template for one publication year; ``{year}`` is filled in with
# ``str.format``.
# NOTE(review): the template appears to lack a scheme and host -- confirm
# against the original file before running.
base_juser_url = "{year}+and+0247_a%3A%2F%5E10%5C.%2F"

# Shared limiter; tasks that enter it are capped at 100 at a time.
limit = trio.CapacityLimiter(100)

# Normalized DOIs collected so far, shared by all crawl tasks.
dois = set()

# NOTE(review): presumably collects DOIs that turned out to be invalid; not
# written to in the visible part of the file -- confirm.
broken_dois = []
def normalize_doi(doi):
    """Normalize a DOI string so that equal DOIs compare equal.

    Surrounding whitespace is stripped and the DOI is lower-cased.  May return
    ``None`` if the DOI is invalid; currently that means nothing is left after
    stripping.

    :param doi: the raw DOI string
    :return: the normalized DOI, or ``None`` if it is empty
    """
    result = doi.strip().lower()
    # An empty string is falsy, so ``or`` maps it to None.
    return result or None
async def fetch_200_dois(start, year):
    """Fetch one page of up to 200 DOIs and add them to the global ``dois``.

    :param start: value of the ``jrec`` query parameter, i.e. the record at
        which the requested page starts
    :param year: publication year substituted into ``base_juser_url``
    """
    url = base_juser_url.format(year=year) + f"&rg=200&jrec={start}&of=doi"
    # Hold a limiter slot only for the duration of the request, and close the
    # client deterministically instead of leaking its connection pool.
    async with limit, httpx.AsyncClient() as client:
        response = await client.get(url)
    # NOTE(review): the iterable of this comprehension was missing in the
    # source.  This assumes the ``of=doi`` output is plain text with one DOI
    # per line -- confirm against the server's actual response format.
    chunk_dois = [normalize_doi(doi) for doi in response.text.splitlines()]
    # Drop DOIs that normalized to None.
    dois.update(doi for doi in chunk_dois if doi)
async def crawl_year(year):
    """Collect the DOIs of all publications of one year.

    Requests the result page for ``year``, reads the total number of
    publications from it, and starts one ``fetch_200_dois`` task per page of
    200 records.

    :param year: publication year substituted into ``base_juser_url``
    :raises ValueError: if the number of publications cannot be found in the
        response
    """
    # Use the async client: a synchronous ``httpx.get`` would block the event
    # loop and stall every other running task.
    async with httpx.AsyncClient() as client:
        response = await client.get(base_juser_url.format(year=year))
    # NOTE(review): this expression was garbled in the source; reconstructed
    # from the surviving pattern and the ``",", ""`` replace arguments as a
    # search over the response text -- confirm.
    match = re.search(r": <strong>([^<]+)", response.text)
    if match is None:
        raise ValueError(
            f"could not find the number of publications for {year}")
    # The count may contain thousands separators such as "1,234".
    number_publications = int(match.group(1).replace(",", ""))
    async with trio.open_nursery() as nursery:
        for start in range(1, number_publications + 1, 200):
            nursery.start_soon(fetch_200_dois, start, year)
async def crawl():
    """Crawl the DOIs of the current year and the two years before it.

    Starts one ``crawl_year`` task per year and returns once all of them have
    finished.
    """
    # NOTE(review): the source line was garbled (only ``int("%Y"))`` was
    # left, presumably a strftime call); the current year is taken directly
    # from today's date instead.
    current_year = datetime.date.today().year
    async with trio.open_nursery() as nursery:
        for year in range(current_year - 2, current_year + 1):
            nursery.start_soon(crawl_year, year)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment