--- a/data/script/scrap_pyavm_spitzer.py Mon Jul 30 14:00:40 2018 +0200
+++ b/data/script/scrap_pyavm_spitzer.py Mon Jul 30 14:21:28 2018 +0200
@@ -1,5 +1,6 @@
import json
import os
+import os.path
from urllib import request
import requests
@@ -29,12 +30,17 @@
for page in range (1, 10):
- url = "http://www.spitzer.caltech.edu/search/image_set/20?resource=image_set&tabs=hidden&page={}".format(page)
+ url = "http://www.spitzer.caltech.edu/search/image_set/100?resource=image_set&tabs=hidden&page={}".format(page)
website = requests.get(url)
soup = BeautifulSoup(website.content, "html5lib")
table = soup.find("table", {"class":"items"})
itemtds = table.find_all("td", {"class":"item"})
+
+ if len(itemtds) == 0:
+ print("No item in page, exit.")
+ break
+
for td in itemtds:
# itemlinks = find("a", href=True)
# print(td.div.a.get('href'))
@@ -57,6 +63,9 @@
os.makedirs('scrapSpitzer/' + img_id, exist_ok=True)
img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id)
json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id)
+ if os.path.isfile(img_path):
+ print("--> file %s exists from url %s" % (img_path,img_url))
+ continue
request.urlretrieve(img_url, img_path)
avm = AVM.from_image(img_path)
img_data = convert_avm_container(avm)
@@ -78,14 +87,9 @@
with open(json_path, 'w') as outfile:
json.dump(img_json, outfile)
- break
-
-# avm = AVM.from_image('eso1238a.jpg')
-# print(avm)
-
'''
MetadataVersion: b'1.2'