diff options
author | Yuqian Yang <crupest@crupest.life> | 2025-02-28 23:13:39 +0800 |
---|---|---|
committer | Yuqian Yang <crupest@crupest.life> | 2025-02-28 23:13:39 +0800 |
commit | dc1f0c4c0096013799416664894c5194dc7e1f52 (patch) | |
tree | 2f5d235f778cd720f4c39ec3e56b77ba6d99f375 /store/works/life/gpcp/GpcpScrawler.py | |
parent | 7299d424d90b1effb6db69e3476ddd5af72eeba4 (diff) | |
download | crupest-dc1f0c4c0096013799416664894c5194dc7e1f52.tar.gz crupest-dc1f0c4c0096013799416664894c5194dc7e1f52.tar.bz2 crupest-dc1f0c4c0096013799416664894c5194dc7e1f52.zip |
chore(store): move everything to store.
Diffstat (limited to 'store/works/life/gpcp/GpcpScrawler.py')
-rw-r--r-- | store/works/life/gpcp/GpcpScrawler.py | 38 |
1 files changed, 38 insertions, 0 deletions
# Source: store/works/life/gpcp/GpcpScrawler.py (new file, mode 100644, blob 7cf67ec)
"""Download GPCP daily precipitation files from the NOAA NCEI archive.

For each of ten consecutive years starting at ``start_year``, a thread
fetches that year's directory listing from ``base_url``, and saves every
linked file whose name starts with ``gpcp`` into the current working
directory.
"""

import threading
import time

import requests
from bs4 import BeautifulSoup

base_url = "https://www.ncei.noaa.gov/data/global-precipitation-climatology-project-gpcp-daily/access/"

start_year = 2011


def get_with_retry(url, retry_delay=1.0):
    """Fetch *url* with GET, retrying until a response is received.

    Only network-level failures reported by ``requests`` (connection
    errors, timeouts, ...) trigger a retry; any other exception
    propagates so that programming errors and interpreter shutdown are
    not silently swallowed.

    Args:
        url: Address to fetch.
        retry_delay: Seconds to wait between attempts, so a failing
            server is not hammered in a tight loop.

    Returns:
        The ``requests.Response`` of the first attempt that completed.
        The HTTP status is NOT checked here; callers decide what to do
        with non-2xx responses.
    """
    while True:
        try:
            return requests.get(url, timeout=30)
        except requests.RequestException as error:
            print("Request to " + url + " failed (" + str(error) + "), retrying...")
            time.sleep(retry_delay)


def worker(year):
    """Download every ``gpcp*`` file listed on the index page for *year*.

    Files are written to the current working directory under the name
    used in the listing's link.

    Args:
        year: Year whose sub-directory under ``base_url`` is crawled.
    """
    html_url = base_url + str(year)
    html = get_with_retry(html_url).text
    soup = BeautifulSoup(html, 'html.parser')
    # href=True skips anchors that carry no href attribute, which would
    # otherwise raise KeyError when indexed.
    names = [a['href'] for a in soup.find_all('a', href=True)]
    for name in names:
        if not name.startswith("gpcp"):
            continue
        url = html_url + '/' + name
        print("Downloading " + name + "...")
        response = get_with_retry(url)
        if not response.ok:
            # Do not save an HTTP error page under the data file's name.
            print("Skipping " + name + ": HTTP " + str(response.status_code))
            continue
        # The context manager guarantees the file is flushed and closed.
        with open(name, 'wb') as f:
            f.write(response.content)


def main():
    """Start one download thread per year and wait for all to finish."""
    threads = []

    for year in range(start_year, start_year + 10):
        t = threading.Thread(target=worker, args=(year,))
        threads.append(t)
        t.start()

    for t in threads:
        t.join()


if __name__ == "__main__":
    main()