equal
deleted
inserted
replaced
1 import json |
1 import json |
2 import os |
2 import os |
|
3 import os.path |
3 from urllib import request |
4 from urllib import request |
4 |
5 |
5 import requests |
6 import requests |
6 |
7 |
7 from bs4 import BeautifulSoup |
8 from bs4 import BeautifulSoup |
27 # res[k] = v |
28 # res[k] = v |
28 # return res |
29 # return res |
29 |
30 |
30 |
31 |
31 for page in range (1, 10): |
32 for page in range (1, 10): |
32 url = "http://www.spitzer.caltech.edu/search/image_set/20?resource=image_set&tabs=hidden&page={}".format(page) |
33 url = "http://www.spitzer.caltech.edu/search/image_set/100?resource=image_set&tabs=hidden&page={}".format(page) |
33 website = requests.get(url) |
34 website = requests.get(url) |
34 soup = BeautifulSoup(website.content, "html5lib") |
35 soup = BeautifulSoup(website.content, "html5lib") |
35 |
36 |
36 table = soup.find("table", {"class":"items"}) |
37 table = soup.find("table", {"class":"items"}) |
37 itemtds = table.find_all("td", {"class":"item"}) |
38 itemtds = table.find_all("td", {"class":"item"}) |
|
39 |
|
40 if len(itemtds) == 0: |
|
41 print("No item in page, exit.") |
|
42 break |
|
43 |
38 for td in itemtds: |
44 for td in itemtds: |
39 # itemlinks = find("a", href=True) |
45 # itemlinks = find("a", href=True) |
40 # print(td.div.a.get('href')) |
46 # print(td.div.a.get('href')) |
41 detail_url = "http://www.spitzer.caltech.edu" + td.div.a.get('href') |
47 detail_url = "http://www.spitzer.caltech.edu" + td.div.a.get('href') |
42 detail_content = requests.get(detail_url).content |
48 detail_content = requests.get(detail_url).content |
55 continue |
61 continue |
56 if not os.path.isdir('scrapSpitzer/' + img_id): |
62 if not os.path.isdir('scrapSpitzer/' + img_id): |
57 os.makedirs('scrapSpitzer/' + img_id, exist_ok=True) |
63 os.makedirs('scrapSpitzer/' + img_id, exist_ok=True) |
58 img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id) |
64 img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id) |
59 json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id) |
65 json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id) |
|
66 if os.path.isfile(img_path): |
|
67 print("--> file %s exists from url %s" % (img_path,img_url)) |
|
68 continue |
60 request.urlretrieve(img_url, img_path) |
69 request.urlretrieve(img_url, img_path) |
61 avm = AVM.from_image(img_path) |
70 avm = AVM.from_image(img_path) |
62 img_data = convert_avm_container(avm) |
71 img_data = convert_avm_container(avm) |
63 |
72 |
64 img_json = { |
73 img_json = { |
76 'url': img_data.get('ReferenceURL') |
85 'url': img_data.get('ReferenceURL') |
77 } |
86 } |
78 |
87 |
79 with open(json_path, 'w') as outfile: |
88 with open(json_path, 'w') as outfile: |
80 json.dump(img_json, outfile) |
89 json.dump(img_json, outfile) |
81 break |
|
82 |
90 |
83 |
91 |
84 |
|
85 |
|
86 # avm = AVM.from_image('eso1238a.jpg') |
|
87 # print(avm) |
|
88 |
92 |
89 ''' |
93 ''' |
90 |
94 |
91 MetadataVersion: b'1.2' |
95 MetadataVersion: b'1.2' |
92 Creator: b'Spitzer Space Telescope' |
96 Creator: b'Spitzer Space Telescope' |