data/script/scrap_pyavm_spitzer.py
changeset 25 8690bf2fb09a
parent 24 0c9c840b82dc
child 26 957d03d2bc26
--- a/data/script/scrap_pyavm_spitzer.py	Mon Jul 30 14:00:40 2018 +0200
+++ b/data/script/scrap_pyavm_spitzer.py	Mon Jul 30 14:21:28 2018 +0200
@@ -1,5 +1,6 @@
 import json
 import os
+import os.path
 from urllib import request
 
 import requests
@@ -29,12 +30,17 @@
 
 
 for page in range (1, 10):
-    url = "http://www.spitzer.caltech.edu/search/image_set/20?resource=image_set&tabs=hidden&page={}".format(page)
+    url = "http://www.spitzer.caltech.edu/search/image_set/100?resource=image_set&tabs=hidden&page={}".format(page)
     website = requests.get(url)
     soup = BeautifulSoup(website.content, "html5lib")
 
     table = soup.find("table", {"class":"items"})
     itemtds = table.find_all("td", {"class":"item"})
+
+    if len(itemtds) == 0:
+        print("No item in page, exit.")
+        break
+
     for td in itemtds:
     # itemlinks = find("a", href=True)
         # print(td.div.a.get('href'))
@@ -57,6 +63,9 @@
             os.makedirs('scrapSpitzer/' + img_id, exist_ok=True)
         img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id)
         json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id)
+        if os.path.isfile(img_path):
+            print("--> file %s exists from url %s" % (img_path,img_url))
+            continue
         request.urlretrieve(img_url, img_path)
         avm = AVM.from_image(img_path)
         img_data = convert_avm_container(avm)
@@ -78,14 +87,9 @@
 
         with open(json_path, 'w') as outfile:
             json.dump(img_json, outfile)
-    break
-
 
 
 
-# avm = AVM.from_image('eso1238a.jpg')
-# print(avm)
-
 '''
 
 MetadataVersion: b'1.2'