about | summary | refs | log | tree | commit | diff
path: root/works/life/gpcp/GpcpScrawler.py
diff options
context:
space:
mode:
Diffstat (limited to 'works/life/gpcp/GpcpScrawler.py')
-rw-r--r-- works/life/gpcp/GpcpScrawler.py 38
1 files changed, 0 insertions, 38 deletions
diff --git a/works/life/gpcp/GpcpScrawler.py b/works/life/gpcp/GpcpScrawler.py
deleted file mode 100644
index 7cf67ec..0000000
--- a/works/life/gpcp/GpcpScrawler.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import threading
-import time
-
-import requests
-from bs4 import BeautifulSoup
-
-# Root of the remote directory listing; worker() appends a year to it to
-# reach that year's file index.
-base_url = "https://www.ncei.noaa.gov/data/global-precipitation-climatology-project-gpcp-daily/access/"
-
-# First year to download; the script below covers ten consecutive years
-# starting here.
-start_year = 2011
-
-
-def get_with_retry(url, max_retries=None, delay=1.0):
-    """Fetch ``url`` with ``requests.get``, retrying when the request fails.
-
-    Args:
-        url: Address to request.
-        max_retries: Number of retries allowed after the first failed
-            attempt. ``None`` (the default) retries indefinitely.
-        delay: Seconds to pause between attempts.
-
-    Returns:
-        The ``requests.Response`` of the first attempt that completes.
-        The HTTP status code is not inspected here.
-
-    Raises:
-        requests.RequestException: If ``max_retries`` is set and every
-            attempt failed; the last error is re-raised.
-    """
-    failures = 0
-    while True:
-        try:
-            return requests.get(url, timeout=30)
-        except requests.RequestException:
-            # Only request-level failures are retried. KeyboardInterrupt,
-            # SystemExit and programming errors propagate instead of being
-            # swallowed by an endless loop.
-            failures += 1
-            if max_retries is not None and failures > max_retries:
-                raise
-            # Pause so a failing server is not hit in a tight loop.
-            time.sleep(delay)
-
-
-def worker(year):
-    """Download every ``gpcp*`` file linked from the listing page of ``year``.
-
-    The page at ``base_url + str(year)`` is fetched and parsed; each anchor
-    whose ``href`` starts with ``"gpcp"`` is downloaded and written to a
-    file of the same name in the current working directory.
-
-    Args:
-        year: Year whose directory name is appended to ``base_url``.
-    """
-    html_url = base_url + str(year)
-    html = get_with_retry(html_url).text
-    soup = BeautifulSoup(html, 'html.parser')
-    # href=True skips anchors that have no href attribute, which would
-    # otherwise raise KeyError.
-    names = [a['href'] for a in soup.find_all('a', href=True)]
-    for name in names:
-        if not name.startswith("gpcp"):
-            continue
-        url = html_url + '/' + name
-        print("Downloading " + name + "...")
-        response = get_with_retry(url)
-        if not response.ok:
-            # Do not save an HTTP error page under the data file's name.
-            print("Failed to download " + name + ": HTTP " + str(response.status_code))
-            continue
-        # The context manager closes the file even if the write fails.
-        with open(name, 'wb') as out_file:
-            out_file.write(response.content)
-
-
-# Create one download thread per year for the ten consecutive years that
-# begin at start_year, start them all, then wait for every one to finish.
-threads = [
-    threading.Thread(target=worker, args=(year,))
-    for year in range(start_year, start_year + 10)
-]
-
-for thread in threads:
-    thread.start()
-
-for thread in threads:
-    thread.join()