iconolab-episteme: comparison data/script/scrap_pyavm

equal deleted inserted replaced

-:0c9c840b82dc
+:8690bf2fb09a
 import json
 import os
+import os.path
 from urllib import request
 import requests
 from bs4 import BeautifulSoup
 #         res[k] = v
 # return res
 # Walk search-result pages 1 through 9 of the Spitzer image-set listing and,
 # for every item on a page, fetch its detail page, download the image and
 # write a small JSON file next to it.
 for page in range (1, 10):
 # The two revisions differ only in the number after "image_set/" (20 vs 100);
 # presumably this is the number of results per page -- TODO confirm.
-url = "http://www.spitzer.caltech.edu/search/image_set/20?resource=image_set&tabs=hidden&page={}".format(page)
+url = "http://www.spitzer.caltech.edu/search/image_set/100?resource=image_set&tabs=hidden&page={}".format(page)
 website = requests.get(url)
 soup = BeautifulSoup(website.content, "html5lib")
 # Items are the <td class="item"> cells inside the <table class="items"> element.
 # NOTE(review): soup.find() returns None when no such table exists, in which
 # case the find_all() call below would raise before the empty-page check runs.
 table = soup.find("table", {"class":"items"})
 itemtds = table.find_all("td", {"class":"item"})
 # Stop paging at the first results page that lists no items.
+if len(itemtds) == 0:
+print("No item in page, exit.")
+break
 for td in itemtds:
 # itemlinks = find("a", href=True)
 # print(td.div.a.get('href'))
 # The href is appended to the site root, so it is expected to be a
 # site-relative path to the item's detail page.
 detail_url = "http://www.spitzer.caltech.edu" + td.div.a.get('href')
 detail_content = requests.get(detail_url).content
 # NOTE(review): the lines between the detail-page fetch above and the
 # `continue` below are not shown in this comparison; img_id and img_url used
 # further down are presumably assigned there -- confirm against the full file.
 continue
 # Each image gets its own directory: scrapSpitzer/<img_id>/ holding
 # <img_id>.jpg and <img_id>.json.
 # NOTE(review): makedirs is called with exist_ok=True, so the isdir guard
 # looks redundant.
 if not os.path.isdir('scrapSpitzer/' + img_id):
 os.makedirs('scrapSpitzer/' + img_id, exist_ok=True)
 img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id)
 json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id)
 # Skip items whose .jpg is already on disk so a rerun does not download it
 # again; the JSON file is not rewritten for skipped items either.
+if os.path.isfile(img_path):
+print("--> file %s exists from url %s" % (img_path,img_url))
+continue
 # Download the image, then read its metadata back from the saved file.
 request.urlretrieve(img_url, img_path)
 avm = AVM.from_image(img_path)
 img_data = convert_avm_container(avm)
 # Only the 'ReferenceURL' entry is written to the JSON file, under the key 'url'.
 img_json = {
 'url': img_data.get('ReferenceURL')
 }
 with open(json_path, 'w') as outfile:
 json.dump(img_json, outfile)
-break
-# avm = AVM.from_image('eso1238a.jpg')
-# print(avm)
 '''
 MetadataVersion: b'1.2'
 Creator: b'Spitzer Space Telescope'

changeset 25	8690bf2fb09a
parent 24	0c9c840b82dc
child 26	957d03d2bc26