# Changeset summary: data/script/scrap_pyavm_spitzer.py, revision 0c9c840b82dc -> 8690bf2fb09a.
#
# What the hunks below change:
#   * Hunk -1,5: adds `import os.path` next to the existing `import os`.
#   * Hunk -29,12: the search URL path segment changes from `image_set/20` to
#     `image_set/100` (presumably the number of results per page -- TODO confirm
#     against the site), and the page loop now prints "No item in page, exit."
#     and breaks as soon as a page yields no `td` cells with class "item".
#   * Hunk -57,6: before calling `request.urlretrieve`, the script now checks
#     `os.path.isfile(img_path)`; if the image file is already present it prints
#     a notice and `continue`s instead of downloading again.
#   * Hunk -78,14: removes a `break` that followed the `json.dump(...)` call
#     (presumably a debugging limit on the number of items processed -- TODO
#     confirm) and deletes two commented-out `AVM.from_image` debug lines.
#
# NOTE(review): in this copy the diff's line breaks and indentation appear to
# have been flattened onto a single line, so it cannot be applied as a patch
# as-is; the hunk line counts also imply blank context lines that are no longer
# visible. The text is left untouched rather than guessing at the lost
# whitespace.
diff -r 0c9c840b82dc -r 8690bf2fb09a data/script/scrap_pyavm_spitzer.py --- a/data/script/scrap_pyavm_spitzer.py Mon Jul 30 14:00:40 2018 +0200 +++ b/data/script/scrap_pyavm_spitzer.py Mon Jul 30 14:21:28 2018 +0200 @@ -1,5 +1,6 @@ import json import os +import os.path from urllib import request import requests @@ -29,12 +30,17 @@ for page in range (1, 10): - url = "http://www.spitzer.caltech.edu/search/image_set/20?resource=image_set&tabs=hidden&page={}".format(page) + url = "http://www.spitzer.caltech.edu/search/image_set/100?resource=image_set&tabs=hidden&page={}".format(page) website = requests.get(url) soup = BeautifulSoup(website.content, "html5lib") table = soup.find("table", {"class":"items"}) itemtds = table.find_all("td", {"class":"item"}) + + if len(itemtds) == 0: + print("No item in page, exit.") + break + for td in itemtds: # itemlinks = find("a", href=True) # print(td.div.a.get('href')) @@ -57,6 +63,9 @@ os.makedirs('scrapSpitzer/' + img_id, exist_ok=True) img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id) json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id) + if os.path.isfile(img_path): + print("--> file %s exists from url %s" % (img_path,img_url)) + continue request.urlretrieve(img_url, img_path) avm = AVM.from_image(img_path) img_data = convert_avm_container(avm) @@ -78,14 +87,9 @@ with open(json_path, 'w') as outfile: json.dump(img_json, outfile) - break - -# avm = AVM.from_image('eso1238a.jpg') -# print(avm) - ''' MetadataVersion: b'1.2'