aboutsummaryrefslogtreecommitdiff
path: root/works/life/gpcp/GpcpScrawler.py
blob: 7cf67ecc8280eab0fd6b0749051ec2e237d38e98 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import threading
import time

import requests
from bs4 import BeautifulSoup

# Directory listing root of the NOAA GPCP daily-precipitation archive;
# per-year subdirectories (e.g. .../2011) hold the downloadable files.
base_url = "https://www.ncei.noaa.gov/data/global-precipitation-climatology-project-gpcp-daily/access/"

# First year to download; the loop below covers start_year .. start_year+9.
start_year = 2011


def get_with_retry(url):
    """GET *url* and return the response, retrying forever on network errors.

    Keeps the original retry-until-success contract, but only catches
    requests' own exceptions — a bare ``except`` would also swallow
    ``KeyboardInterrupt``/``SystemExit`` and make the script impossible
    to stop while the server is down.  A short sleep between attempts
    avoids hammering the server in a hot loop.
    """
    while True:
        try:
            return requests.get(url, timeout=30)
        except requests.exceptions.RequestException:
            # Transient network failure — pause briefly, then retry.
            time.sleep(1)


def worker(year):
    """Download every GPCP daily data file for *year* into the CWD.

    Fetches the year's directory listing, extracts all anchor hrefs, and
    downloads each entry whose name starts with "gpcp" (the data files,
    as opposed to navigation links).
    """
    html_url = base_url + str(year)
    html = get_with_retry(html_url).text
    soup = BeautifulSoup(html, 'html.parser')
    names = [a.attrs['href'] for a in soup.find_all('a')]
    for name in names:
        if name.startswith("gpcp"):
            url = html_url + '/' + name
            print("Downloading " + name + "...")
            # Context manager closes the file promptly; the original
            # open(...).write(...) leaked the handle to the GC.
            with open(name, 'wb') as f:
                f.write(get_with_retry(url).content)


# Fan out one downloader thread per year, then wait for all of them to finish.
threads = [
    threading.Thread(target=worker, args=(year,))
    for year in range(start_year, start_year + 10)
]

for t in threads:
    t.start()

for t in threads:
    t.join()