41 print("No item in page, exit.") |
53 print("No item in page, exit.") |
42 break |
54 break |
43 |
55 |
44 for td in itemtds: |
56 for td in itemtds: |
45 # itemlinks = find("a", href=True) |
57 # itemlinks = find("a", href=True) |
46 # print(td.div.a.get('href')) |
58 img_href = td.div.a.get('href') |
47 detail_url = "http://www.spitzer.caltech.edu" + td.div.a.get('href') |
59 print("IMG HREF %s" % img_href) |
48 detail_content = requests.get(detail_url).content |
60 img_slug = img_href.split("/")[-1] |
49 detail_soup = BeautifulSoup(detail_content, "html5lib") |
61 img_dir_path = 'scrapSpitzer/' + img_slug |
50 img_id_elem = detail_soup.find("dd", {"property":"avm:ID"}) |
62 if os.path.isdir(img_dir_path) and glob.glob("%s/*.jpg"%img_dir_path) and glob.glob("%s/*.json"%img_dir_path): |
51 img_id = img_id_elem.getText() |
63 print("--> img %s exists" % (img_slug,)) |
52 img_url = None |
|
53 img_box = detail_soup.find("div", {"class":"sidebar-section download"}) |
|
54 for img_link in img_box.find_all("a"): |
|
55 img_link_href = img_link.get("href") |
|
56 if img_link_href.endswith(img_id + ".jpg"): |
|
57 img_url = "http://www.spitzer.caltech.edu" + img_link_href |
|
58 break |
|
59 print(img_url) |
|
60 if not img_url: |
|
61 continue |
64 continue |
62 if not os.path.isdir('scrapSpitzer/' + img_id): |
65 else: |
63 os.makedirs('scrapSpitzer/' + img_id, exist_ok=True) |
66 os.makedirs(img_dir_path, exist_ok=True) |
64 img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id) |
|
65 json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id) |
|
66 if os.path.isfile(img_path): |
|
67 print("--> file %s exists from url %s" % (img_path,img_url)) |
|
68 continue |
|
69 request.urlretrieve(img_url, img_path) |
|
70 avm = AVM.from_image(img_path) |
|
71 img_data = convert_avm_container(avm) |
|
72 |
67 |
73 img_json = { |
68 detail_url = "http://www.spitzer.caltech.edu" + img_href |
74 'avm': img_data, |
|
75 'image': { |
|
76 'id': img_data.get('ID'), |
|
77 'title': img_data.get('Title'), |
|
78 'description_text': img_data.get('Description'), |
|
79 'description_html': '<div>' + img_data.get('Description') + "</div>", |
|
80 'date_publication' : img_data.get('Date'), |
|
81 'credit': img_data.get('Credit') or 'Courtesy NASA/JPL-Caltech', |
|
82 'type': img_data.get('Type'), |
|
83 'imgurl': img_data.get('ResourceURL') |
|
84 }, |
|
85 'url': img_data.get('ReferenceURL') |
|
86 } |
|
87 |
69 |
88 with open(json_path, 'w') as outfile: |
70 try: |
89 json.dump(img_json, outfile) |
71 detail_content = requests.get(detail_url).content |
|
72 detail_soup = BeautifulSoup(detail_content, "html5lib") |
|
73 img_id_elem = detail_soup.find("dd", {"property":"avm:ID"}) |
|
74 img_id = img_id_elem.getText().strip() if img_id_elem else None |
|
75 if not img_id: |
|
76 print("--> img %s has no id, skipping" % img_href) |
|
77 continue |
|
78 img_url = None |
|
79 img_box = detail_soup.find("div", {"class":"sidebar-section download"}) |
|
80 |
|
81 img_link_href_array = [ |
|
82 (img_link.get("href"), parse_img_link_size(img_link.getText())) |
|
83 for img_link in img_box.find_all("a") ] |
|
84 |
|
85 for img_link_href in img_link_href_array: |
|
86 if img_link_href[0].endswith(img_id + ".jpg"): |
|
87 img_url = "http://www.spitzer.caltech.edu" + img_link_href[0] |
|
88 break |
|
89 |
|
90 if not img_url: |
|
91 jpeg_img_link_href_array = sorted(filter(lambda l: l[0].endswith(".jpg"), img_link_href_array), key=lambda e: e[1], reverse=True) |
|
92 if jpeg_img_link_href_array: |
|
93 img_url = "http://www.spitzer.caltech.edu" + jpeg_img_link_href_array[0][0] |
90 |
94 |
91 |
95 |
|
96 if not img_url: |
|
97 tiff_img_link_href_array = sorted(filter(lambda l: l[0].endswith(".tif"), img_link_href_array), key=lambda e: e[1], reverse=True) |
|
98 if tiff_img_link_href_array: |
|
99 img_url = "http://www.spitzer.caltech.edu" + tiff_img_link_href_array[0][0] |
92 |
100 |
93 ''' |
|
94 |
101 |
95 MetadataVersion: b'1.2' |
102 if not img_url: |
96 Creator: b'Spitzer Space Telescope' |
103 print("NO IMG URL for %s : %r" % (img_id, img_link_href_array)) |
97 Title: b'Surface of TRAPPIST-1f' |
104 continue |
98 Headline: b"Imagine standing on the surface of the exoplanet TRAPPIST-1f. This artist's concept is one interpretation of what it could look like. " |
105 print("IMG URL %s" % img_url) |
99 Description: b"This artist's concept allows us to imagine what it would be like to stand on the surface of the exoplanet TRAPPIST-1f, located in the TRAPPIST-1 system in the constellation Aquarius. \n\nBecause this planet is thought to be tidally locked to its star, meaning the same face of the planet is always pointed at the star, there would be a region called the terminator that perpetually divides day and night. If the night side is icy, the day side might give way to liquid water in the area where sufficient starlight hits the surface. \n\nOne of the unusual features of TRAPPIST-1 planets is how close they are to each other -- so close that other planets could be visible in the sky from the surface of each one. In this view, the planets in the sky correspond to TRAPPIST1e (top left crescent), d (middle crescent) and c (bright dot to the lower right of the crescents). TRAPPIST-1e would appear about the same size as the moon and TRAPPIST1-c is on the far side of the star. The star itself, an ultra-cool dwarf, would appear about three times larger than our own sun does in Earth's skies.\n\nThe TRAPPIST-1 system has been revealed through observations from NASA's Spitzer Space Telescope and the ground-based TRAPPIST (TRAnsiting Planets and PlanetesImals Small Telescope) telescope, as well as other ground-based observatories. The system was named for the TRAPPIST telescope.\n\nNASA's Jet Propulsion Laboratory, Pasadena, California, manages the Spitzer Space Telescope mission for NASA's Science Mission Directorate, Washington. Science operations are conducted at the Spitzer Science Center at Caltech in Pasadena. Spacecraft operations are based at Lockheed Martin Space Systems Company, Littleton, Colorado. Data are archived at the Infrared Science Archive housed at Caltech/IPAC. Caltech manages JPL for NASA. " |
106 p = urlparse.urlparse(img_url).path |
100 ReferenceURL: b'http://www.spitzer.caltech.edu/images/6274-ssc2017-01c-Surface-of-TRAPPIST-1f' |
107 img_ext = os.path.splitext(p)[1] |
101 Credit: b'NASA/JPL-Caltech/T. Pyle (IPAC)' |
108 orig_img_path = 'scrapSpitzer/{0}/{1}{2}'.format(img_slug, img_id, img_ext) |
102 Date: b'2017-02-22' |
109 img_path = 'scrapSpitzer/{0}/{1}.jpg'.format(img_slug, img_id) |
103 ID: b'ssc2017-01c' |
110 json_path = 'scrapSpitzer/{0}/{1}.json'.format(img_slug, img_id) |
104 Type: b'Artwork' |
111 if os.path.isfile(img_path) and os.path.isfile(json_path): |
105 Publisher: b'Spitzer Science Center' |
112 print("--> file %s exists from url %s" % (img_path,img_url)) |
106 PublisherID: b'spitzer' |
113 continue |
107 ResourceID: b'ssc2017-01c.jpg' |
114 |
108 ResourceURL: b'http://www.spitzer.caltech.edu/uploaded_files/images/0010/9932/ssc201 |
115 if not os.path.isfile(orig_img_path): |
109 ''' |
116 request.urlretrieve(img_url, orig_img_path) |
|
117 |
|
118 if img_ext != "jpg" and not os.path.isfile(img_path) : |
|
119 im = Image.open(orig_img_path) |
|
120 rgb_im = im.convert('RGB') |
|
121 rgb_im.save(img_path) |
|
122 |
|
123 avm = AVM.from_image(orig_img_path) |
|
124 img_data = convert_avm_container(avm) |
|
125 description = img_data.get('Description') or "" |
|
126 |
|
127 img_json = { |
|
128 'avm': img_data, |
|
129 'image': { |
|
130 'id': img_data.get('ID'), |
|
131 'title': img_data.get('Title'), |
|
132 'description_text': description, |
|
133 'description_html': '<div>' + description + "</div>", |
|
134 'date_publication' : img_data.get('Date'), |
|
135 'credit': img_data.get('Credit') or 'Courtesy NASA/JPL-Caltech', |
|
136 'type': img_data.get('Type'), |
|
137 'imgurl': img_data.get('ResourceURL') |
|
138 }, |
|
139 'url': img_data.get('ReferenceURL') |
|
140 } |
|
141 |
|
142 with open(json_path, 'w') as outfile: |
|
143 json.dump(img_json, outfile) |
|
144 |
|
145 if img_ext != "jpg": |
|
146 avm.embed(img_path, img_path) |
|
147 |
|
148 except (KeyboardInterrupt, SystemExit): |
|
149 raise |
|
150 except: |
|
151 print("--> error processing %s" % (img_href,)) |
|
152 traceback.print_exc(file=sys.stdout) |
|
153 |
|
154 # ''' |
|
155 |
|
156 # MetadataVersion: b'1.2' |
|
157 # Creator: b'Spitzer Space Telescope' |
|
158 # Title: b'Surface of TRAPPIST-1f' |
|
159 # Headline: b"Imagine standing on the surface of the exoplanet TRAPPIST-1f. This artist's concept is one interpretation of what it could look like. " |
|
160 # Description: b"This artist's concept allows us to imagine what it would be like to stand on the surface of the exoplanet TRAPPIST-1f, located in the TRAPPIST-1 system in the constellation Aquarius. \n\nBecause this planet is thought to be tidally locked to its star, meaning the same face of the planet is always pointed at the star, there would be a region called the terminator that perpetually divides day and night. If the night side is icy, the day side might give way to liquid water in the area where sufficient starlight hits the surface. \n\nOne of the unusual features of TRAPPIST-1 planets is how close they are to each other -- so close that other planets could be visible in the sky from the surface of each one. In this view, the planets in the sky correspond to TRAPPIST1e (top left crescent), d (middle crescent) and c (bright dot to the lower right of the crescents). TRAPPIST-1e would appear about the same size as the moon and TRAPPIST1-c is on the far side of the star. The star itself, an ultra-cool dwarf, would appear about three times larger than our own sun does in Earth's skies.\n\nThe TRAPPIST-1 system has been revealed through observations from NASA's Spitzer Space Telescope and the ground-based TRAPPIST (TRAnsiting Planets and PlanetesImals Small Telescope) telescope, as well as other ground-based observatories. The system was named for the TRAPPIST telescope.\n\nNASA's Jet Propulsion Laboratory, Pasadena, California, manages the Spitzer Space Telescope mission for NASA's Science Mission Directorate, Washington. Science operations are conducted at the Spitzer Science Center at Caltech in Pasadena. Spacecraft operations are based at Lockheed Martin Space Systems Company, Littleton, Colorado. Data are archived at the Infrared Science Archive housed at Caltech/IPAC. Caltech manages JPL for NASA. " |
|
161 # ReferenceURL: b'http://www.spitzer.caltech.edu/images/6274-ssc2017-01c-Surface-of-TRAPPIST-1f' |
|
162 # Credit: b'NASA/JPL-Caltech/T. Pyle (IPAC)' |
|
163 # Date: b'2017-02-22' |
|
164 # ID: b'ssc2017-01c' |
|
165 # Type: b'Artwork' |
|
166 # Publisher: b'Spitzer Science Center' |
|
167 # PublisherID: b'spitzer' |
|
168 # ResourceID: b'ssc2017-01c.jpg' |
|
169 # ResourceURL: b'http://www.spitzer.caltech.edu/uploaded_files/images/0010/9932/ssc201 |
|
170 # ''' |