data/script/scrap_pyavm_spitzer.py
changeset 26 957d03d2bc26
parent 25 8690bf2fb09a
equal deleted inserted replaced
25:8690bf2fb09a 26:957d03d2bc26
       
     1 import glob
     1 import json
     2 import json
     2 import os
     3 import os
     3 import os.path
     4 import os.path
       
     5 import re
       
     6 import shutil
       
     7 import sys
       
     8 import traceback
       
     9 from urllib import parse as urlparse
     4 from urllib import request
    10 from urllib import request
     5 
    11 
     6 import requests
    12 import requests
     7 
    13 
     8 from bs4 import BeautifulSoup
    14 from bs4 import BeautifulSoup
       
    15 from PIL import Image
     9 from pyavm import AVM
    16 from pyavm import AVM
    10 from pyavm.avm import AVMContainer
    17 from pyavm.avm import AVMContainer
       
    18 
    11 
    19 
    12 def convert_avm_container(s):
    20 def convert_avm_container(s):
    13     avm_items = s._items
    21     avm_items = s._items
    14     return {
    22     return {
    15         k : convert_avm_container(avm_items.get(k)) if isinstance(avm_items.get(k), AVMContainer) else avm_items.get(k)
    23         k : convert_avm_container(avm_items.get(k)) if isinstance(avm_items.get(k), AVMContainer) else avm_items.get(k)
    25     #         print("%s IS AVMContainer %r" % (k, v))
    33     #         print("%s IS AVMContainer %r" % (k, v))
    26     #         res[k] = convert_avm_container(v)
    34     #         res[k] = convert_avm_container(v)
    27     #     else:
    35     #     else:
    28     #         res[k] = v
    36     #         res[k] = v
    29     # return res
    37     # return res
       
    38 IMG_LINK_RE = re.compile(r"(\d+)\s*x\s*(\d+)\s")
    30 
    39 
       
    40 def parse_img_link_size(img_link_txt):
       
    41     m = IMG_LINK_RE.search(img_link_txt)
       
    42     return int(m.group(1))*int(m.group(2)) if m else 0
    31 
    43 
    32 for page in range (1, 10):
    44 for page in range (1, 10):
    33     url = "http://www.spitzer.caltech.edu/search/image_set/100?resource=image_set&tabs=hidden&page={}".format(page)
    45     url = "http://www.spitzer.caltech.edu/search/image_set/100?resource=image_set&tabs=hidden&page={}".format(page)
    34     website = requests.get(url)
    46     website = requests.get(url)
    35     soup = BeautifulSoup(website.content, "html5lib")
    47     soup = BeautifulSoup(website.content, "html5lib")
    41         print("No item in page, exit.")
    53         print("No item in page, exit.")
    42         break
    54         break
    43 
    55 
    44     for td in itemtds:
    56     for td in itemtds:
    45     # itemlinks = find("a", href=True)
    57     # itemlinks = find("a", href=True)
    46         # print(td.div.a.get('href'))
    58         img_href = td.div.a.get('href')
    47         detail_url = "http://www.spitzer.caltech.edu" + td.div.a.get('href')
    59         print("IMG HREF %s" % img_href)
    48         detail_content = requests.get(detail_url).content
    60         img_slug = img_href.split("/")[-1]
    49         detail_soup = BeautifulSoup(detail_content, "html5lib")
    61         img_dir_path = 'scrapSpitzer/' + img_slug
    50         img_id_elem = detail_soup.find("dd", {"property":"avm:ID"})
    62         if os.path.isdir(img_dir_path) and glob.glob("%s/*.jpg"%img_dir_path) and glob.glob("%s/*.json"%img_dir_path):
    51         img_id = img_id_elem.getText()
    63             print("--> img %s exists" % (img_slug,))
    52         img_url = None
       
    53         img_box = detail_soup.find("div", {"class":"sidebar-section download"})
       
    54         for img_link in img_box.find_all("a"):
       
    55             img_link_href = img_link.get("href")
       
    56             if img_link_href.endswith(img_id + ".jpg"):
       
    57                 img_url = "http://www.spitzer.caltech.edu" + img_link_href
       
    58                 break
       
    59         print(img_url)
       
    60         if not img_url:
       
    61             continue
    64             continue
    62         if not os.path.isdir('scrapSpitzer/' + img_id):
    65         else:
    63             os.makedirs('scrapSpitzer/' + img_id, exist_ok=True)
    66             os.makedirs(img_dir_path, exist_ok=True)
    64         img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id)
       
    65         json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id)
       
    66         if os.path.isfile(img_path):
       
    67             print("--> file %s exists from url %s" % (img_path,img_url))
       
    68             continue
       
    69         request.urlretrieve(img_url, img_path)
       
    70         avm = AVM.from_image(img_path)
       
    71         img_data = convert_avm_container(avm)
       
    72 
    67 
    73         img_json = { 
    68         detail_url = "http://www.spitzer.caltech.edu" + img_href
    74             'avm': img_data,
       
    75             'image': {
       
    76                 'id': img_data.get('ID'),
       
    77                 'title': img_data.get('Title'),
       
    78                 'description_text': img_data.get('Description'),
       
    79                 'description_html': '<div>' + img_data.get('Description') + "</div>",
       
    80                 'date_publication' : img_data.get('Date'),
       
    81                 'credit': img_data.get('Credit') or 'Courtesy NASA/JPL-Caltech',
       
    82                 'type': img_data.get('Type'),
       
    83                 'imgurl': img_data.get('ResourceURL')
       
    84             },
       
    85             'url': img_data.get('ReferenceURL')
       
    86         }
       
    87 
    69 
    88         with open(json_path, 'w') as outfile:
    70         try:
    89             json.dump(img_json, outfile)
    71             detail_content = requests.get(detail_url).content
       
    72             detail_soup = BeautifulSoup(detail_content, "html5lib")
       
    73             img_id_elem = detail_soup.find("dd", {"property":"avm:ID"})
       
    74             img_id = img_id_elem.getText().strip() if img_id_elem else None
       
    75             if not img_id:
       
    76                 print("--> img %s has no id, skipping" % img_href)
       
    77                 continue
       
    78             img_url = None
       
    79             img_box = detail_soup.find("div", {"class":"sidebar-section download"})
       
    80 
       
    81             img_link_href_array = [ 
       
    82                 (img_link.get("href"), parse_img_link_size(img_link.getText()))
       
    83                 for img_link in img_box.find_all("a") ]
       
    84 
       
    85             for img_link_href in img_link_href_array:
       
    86                 if img_link_href[0].endswith(img_id + ".jpg"):
       
    87                     img_url = "http://www.spitzer.caltech.edu" + img_link_href[0]
       
    88                     break
       
    89 
       
    90             if not img_url:
       
    91                 jpeg_img_link_href_array = sorted(filter(lambda l: l[0].endswith(".jpg"), img_link_href_array), key=lambda e: e[1], reverse=True)
       
    92                 if jpeg_img_link_href_array:
       
    93                     img_url = "http://www.spitzer.caltech.edu" + jpeg_img_link_href_array[0][0]
    90 
    94 
    91 
    95 
       
    96             if not img_url:
       
    97                 tiff_img_link_href_array = sorted(filter(lambda l: l[0].endswith(".tif"), img_link_href_array), key=lambda e: e[1], reverse=True)
       
    98                 if tiff_img_link_href_array:
       
    99                     img_url = "http://www.spitzer.caltech.edu" + tiff_img_link_href_array[0][0]
    92 
   100 
    93 '''
       
    94 
   101 
    95 MetadataVersion: b'1.2'
   102             if not img_url:
    96 Creator: b'Spitzer Space Telescope'
   103                 print("NO IMG URL for %s : %r" % (img_id, img_link_href_array))
    97 Title: b'Surface of TRAPPIST-1f'
   104                 continue
    98 Headline: b"Imagine standing on the surface of the exoplanet TRAPPIST-1f. This artist's concept is one interpretation of what it could look like. "
   105             print("IMG URL %s" % img_url)
    99 Description: b"This artist's concept allows us to imagine what it would be like to stand on the surface of the exoplanet TRAPPIST-1f, located in the TRAPPIST-1 system in the constellation Aquarius. \n\nBecause this planet is thought to be tidally locked to its star, meaning the same face of the planet is always pointed at the star, there would be a region called the terminator that perpetually divides day and night. If the night side is icy, the day side might give way to liquid water in the area where sufficient starlight hits the surface. \n\nOne of the unusual features of TRAPPIST-1 planets is how close they are to each other -- so close that other planets could be visible in the sky from the surface of each one. In this view, the planets in the sky correspond to TRAPPIST1e (top left crescent), d (middle crescent) and c (bright dot to the lower right of the crescents). TRAPPIST-1e would appear about the same size as the moon and TRAPPIST1-c is on the far side of the star. The star itself, an ultra-cool dwarf, would appear about three times larger than our own sun does in Earth's skies.\n\nThe TRAPPIST-1 system has been revealed through observations from NASA's Spitzer Space Telescope and the ground-based TRAPPIST (TRAnsiting Planets and PlanetesImals Small Telescope) telescope, as well as other ground-based observatories. The system was named for the TRAPPIST telescope.\n\nNASA's Jet Propulsion Laboratory, Pasadena, California, manages the Spitzer Space Telescope mission for NASA's Science Mission Directorate, Washington. Science operations are conducted at the Spitzer Science Center at Caltech in Pasadena. Spacecraft operations are based at Lockheed Martin Space Systems Company, Littleton, Colorado. Data are archived at the Infrared Science Archive housed at Caltech/IPAC. Caltech manages JPL for NASA. "
   106             p = urlparse.urlparse(img_url).path
   100 ReferenceURL: b'http://www.spitzer.caltech.edu/images/6274-ssc2017-01c-Surface-of-TRAPPIST-1f'
   107             img_ext = os.path.splitext(p)[1]
   101 Credit: b'NASA/JPL-Caltech/T. Pyle (IPAC)'
   108             orig_img_path = 'scrapSpitzer/{0}/{1}{2}'.format(img_slug, img_id, img_ext)
   102 Date: b'2017-02-22'
   109             img_path = 'scrapSpitzer/{0}/{1}.jpg'.format(img_slug, img_id)
   103 ID: b'ssc2017-01c'
   110             json_path = 'scrapSpitzer/{0}/{1}.json'.format(img_slug, img_id)
   104 Type: b'Artwork'
   111             if os.path.isfile(img_path) and os.path.isfile(json_path):
   105 Publisher: b'Spitzer Science Center'
   112                 print("--> file %s exists from url %s" % (img_path,img_url))
   106 PublisherID: b'spitzer'
   113                 continue
   107 ResourceID: b'ssc2017-01c.jpg'
   114             
   108 ResourceURL: b'http://www.spitzer.caltech.edu/uploaded_files/images/0010/9932/ssc201
   115             if not os.path.isfile(orig_img_path):
   109 '''
   116                 request.urlretrieve(img_url, orig_img_path)
       
   117 
       
   118             if img_ext != "jpg" and not os.path.isfile(img_path) :
       
   119                 im = Image.open(orig_img_path)
       
   120                 rgb_im = im.convert('RGB')
       
   121                 rgb_im.save(img_path)
       
   122 
       
   123             avm = AVM.from_image(orig_img_path)
       
   124             img_data = convert_avm_container(avm)
       
   125             description = img_data.get('Description') or ""
       
   126 
       
   127             img_json = { 
       
   128                 'avm': img_data,
       
   129                 'image': {
       
   130                     'id': img_data.get('ID'),
       
   131                     'title': img_data.get('Title'),
       
   132                     'description_text': description,
       
   133                     'description_html': '<div>' + description + "</div>",
       
   134                     'date_publication' : img_data.get('Date'),
       
   135                     'credit': img_data.get('Credit') or 'Courtesy NASA/JPL-Caltech',
       
   136                     'type': img_data.get('Type'),
       
   137                     'imgurl': img_data.get('ResourceURL')
       
   138                 },
       
   139                 'url': img_data.get('ReferenceURL')
       
   140             }
       
   141 
       
   142             with open(json_path, 'w') as outfile:
       
   143                 json.dump(img_json, outfile)
       
   144 
       
   145             if img_ext != "jpg":
       
   146                 avm.embed(img_path, img_path)
       
   147 
       
   148         except (KeyboardInterrupt, SystemExit):
       
   149             raise
       
   150         except:
       
   151             print("--> error processing %s" % (img_href,))
       
   152             traceback.print_exc(file=sys.stdout)
       
   153 
       
   154 # '''
       
   155 
       
   156 # MetadataVersion: b'1.2'
       
   157 # Creator: b'Spitzer Space Telescope'
       
   158 # Title: b'Surface of TRAPPIST-1f'
       
   159 # Headline: b"Imagine standing on the surface of the exoplanet TRAPPIST-1f. This artist's concept is one interpretation of what it could look like. "
       
   160 # Description: b"This artist's concept allows us to imagine what it would be like to stand on the surface of the exoplanet TRAPPIST-1f, located in the TRAPPIST-1 system in the constellation Aquarius. \n\nBecause this planet is thought to be tidally locked to its star, meaning the same face of the planet is always pointed at the star, there would be a region called the terminator that perpetually divides day and night. If the night side is icy, the day side might give way to liquid water in the area where sufficient starlight hits the surface. \n\nOne of the unusual features of TRAPPIST-1 planets is how close they are to each other -- so close that other planets could be visible in the sky from the surface of each one. In this view, the planets in the sky correspond to TRAPPIST1e (top left crescent), d (middle crescent) and c (bright dot to the lower right of the crescents). TRAPPIST-1e would appear about the same size as the moon and TRAPPIST1-c is on the far side of the star. The star itself, an ultra-cool dwarf, would appear about three times larger than our own sun does in Earth's skies.\n\nThe TRAPPIST-1 system has been revealed through observations from NASA's Spitzer Space Telescope and the ground-based TRAPPIST (TRAnsiting Planets and PlanetesImals Small Telescope) telescope, as well as other ground-based observatories. The system was named for the TRAPPIST telescope.\n\nNASA's Jet Propulsion Laboratory, Pasadena, California, manages the Spitzer Space Telescope mission for NASA's Science Mission Directorate, Washington. Science operations are conducted at the Spitzer Science Center at Caltech in Pasadena. Spacecraft operations are based at Lockheed Martin Space Systems Company, Littleton, Colorado. Data are archived at the Infrared Science Archive housed at Caltech/IPAC. Caltech manages JPL for NASA. "
       
   161 # ReferenceURL: b'http://www.spitzer.caltech.edu/images/6274-ssc2017-01c-Surface-of-TRAPPIST-1f'
       
   162 # Credit: b'NASA/JPL-Caltech/T. Pyle (IPAC)'
       
   163 # Date: b'2017-02-22'
       
   164 # ID: b'ssc2017-01c'
       
   165 # Type: b'Artwork'
       
   166 # Publisher: b'Spitzer Science Center'
       
   167 # PublisherID: b'spitzer'
       
   168 # ResourceID: b'ssc2017-01c.jpg'
       
   169 # ResourceURL: b'http://www.spitzer.caltech.edu/uploaded_files/images/0010/9932/ssc201
       
   170 # '''