data/script/scrap_pyavm_spitzer.py
author ymh <ymh.work@gmail.com>
Mon, 30 Jul 2018 14:21:28 +0200
changeset 25 8690bf2fb09a
parent 24 0c9c840b82dc
child 26 957d03d2bc26
permissions -rw-r--r--
Small improvement on scrapSpitzer
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
import json
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
import os
25
8690bf2fb09a Small improvement on scrapSpitzer
ymh <ymh.work@gmail.com>
parents: 24
diff changeset
     3
import os.path
24
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
from urllib import request
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     5
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     6
import requests
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     7
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     8
from bs4 import BeautifulSoup
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     9
from pyavm import AVM
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    10
from pyavm.avm import AVMContainer
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    11
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    12
def convert_avm_container(s):
    """Convert an AVM container into a plain, nested ``dict``.

    Args:
        s: An object exposing its fields through a ``_items`` mapping
            (the ``AVM`` object returned by ``AVM.from_image`` or a nested
            ``AVMContainer``).

    Returns:
        A ``dict`` holding every entry of ``s._items`` whose value is not
        ``None``. Values that are themselves ``AVMContainer`` instances are
        converted recursively; every other value is passed through unchanged.
    """
    # Iterate over items() once instead of looking each key up repeatedly.
    # The None filter runs before the isinstance test, so unset fields are
    # dropped without being inspected any further.
    return {
        key: convert_avm_container(value) if isinstance(value, AVMContainer) else value
        for key, value in s._items.items()
        if value is not None
    }
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    30
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    31
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    32
# Site root; result pages and download links use site-relative hrefs.
SPITZER_BASE_URL = "http://www.spitzer.caltech.edu"
# Seconds to wait on each HTTP request so a stalled connection cannot hang the run.
REQUEST_TIMEOUT = 30

# Walk result pages 1..9 of the image-set search and stop at the first page
# that lists no item. For every item, download the "<id>.jpg" image into
# scrapSpitzer/<id>/ and write the AVM metadata read from that image next to
# it as scrapSpitzer/<id>/<id>.json.
for page in range(1, 10):
    url = (SPITZER_BASE_URL + "/search/image_set/100?resource=image_set&tabs=hidden&page={}").format(page)
    website = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(website.content, "html5lib")

    # find() returns None when the results table is missing; treat that the
    # same way as a table without items instead of crashing on None.
    table = soup.find("table", {"class": "items"})
    itemtds = table.find_all("td", {"class": "item"}) if table is not None else []

    if not itemtds:
        print("No item in page, exit.")
        break

    for td in itemtds:
        # The link to the detail page is the first <a> of the cell's first <div>.
        detail_link = td.div.a if td.div is not None else None
        detail_href = detail_link.get('href') if detail_link is not None else None
        if not detail_href:
            print("--> no detail link found in item, skipped")
            continue

        detail_url = SPITZER_BASE_URL + detail_href
        detail_content = requests.get(detail_url, timeout=REQUEST_TIMEOUT).content
        detail_soup = BeautifulSoup(detail_content, "html5lib")

        img_id_elem = detail_soup.find("dd", {"property": "avm:ID"})
        if img_id_elem is None:
            print("--> no avm:ID found in %s, skipped" % detail_url)
            continue
        img_id = img_id_elem.getText()

        # Among the download links, keep the one whose file name is "<id>.jpg".
        # href=True skips anchors without an href attribute.
        img_url = None
        img_box = detail_soup.find("div", {"class": "sidebar-section download"})
        img_links = img_box.find_all("a", href=True) if img_box is not None else []
        for img_link in img_links:
            img_link_href = img_link.get("href")
            if img_link_href.endswith(img_id + ".jpg"):
                img_url = SPITZER_BASE_URL + img_link_href
                break
        print(img_url)
        if not img_url:
            continue

        # exist_ok=True already covers the "directory is there" case.
        os.makedirs('scrapSpitzer/' + img_id, exist_ok=True)
        img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id)
        json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id)

        # Images downloaded by a previous run are not fetched again.
        # NOTE(review): if a previous run failed after the download, the .jpg
        # exists without its .json and is still skipped here.
        if os.path.isfile(img_path):
            print("--> file %s exists from url %s" % (img_path, img_url))
            continue

        request.urlretrieve(img_url, img_path)
        avm = AVM.from_image(img_path)
        img_data = convert_avm_container(avm)

        # The description is optional: only wrap it in a <div> when present,
        # otherwise the string concatenation would raise a TypeError.
        description = img_data.get('Description')
        if description is not None:
            description_html = '<div>' + description + "</div>"
        else:
            description_html = None

        img_json = {
            'avm': img_data,
            'image': {
                'id': img_data.get('ID'),
                'title': img_data.get('Title'),
                'description_text': description,
                'description_html': description_html,
                'date_publication': img_data.get('Date'),
                'credit': img_data.get('Credit') or 'Courtesy NASA/JPL-Caltech',
                'type': img_data.get('Type'),
                'imgurl': img_data.get('ResourceURL'),
            },
            'url': img_data.get('ReferenceURL'),
        }

        with open(json_path, 'w') as outfile:
            json.dump(img_json, outfile)
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    90
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    91
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    92
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    93
'''
Sample AVM metadata for one Spitzer image (ssc2017-01c), kept for reference.
The ResourceURL value is truncated in this sample.

MetadataVersion: b'1.2'
Creator: b'Spitzer Space Telescope'
Title: b'Surface of TRAPPIST-1f'
Headline: b"Imagine standing on the surface of the exoplanet TRAPPIST-1f. This artist's concept is one interpretation of what it could look like. "
Description: b"This artist's concept allows us to imagine what it would be like to stand on the surface of the exoplanet TRAPPIST-1f, located in the TRAPPIST-1 system in the constellation Aquarius. \n\nBecause this planet is thought to be tidally locked to its star, meaning the same face of the planet is always pointed at the star, there would be a region called the terminator that perpetually divides day and night. If the night side is icy, the day side might give way to liquid water in the area where sufficient starlight hits the surface. \n\nOne of the unusual features of TRAPPIST-1 planets is how close they are to each other -- so close that other planets could be visible in the sky from the surface of each one. In this view, the planets in the sky correspond to TRAPPIST1e (top left crescent), d (middle crescent) and c (bright dot to the lower right of the crescents). TRAPPIST-1e would appear about the same size as the moon and TRAPPIST1-c is on the far side of the star. The star itself, an ultra-cool dwarf, would appear about three times larger than our own sun does in Earth's skies.\n\nThe TRAPPIST-1 system has been revealed through observations from NASA's Spitzer Space Telescope and the ground-based TRAPPIST (TRAnsiting Planets and PlanetesImals Small Telescope) telescope, as well as other ground-based observatories. The system was named for the TRAPPIST telescope.\n\nNASA's Jet Propulsion Laboratory, Pasadena, California, manages the Spitzer Space Telescope mission for NASA's Science Mission Directorate, Washington. Science operations are conducted at the Spitzer Science Center at Caltech in Pasadena. Spacecraft operations are based at Lockheed Martin Space Systems Company, Littleton, Colorado. Data are archived at the Infrared Science Archive housed at Caltech/IPAC. Caltech manages JPL for NASA. "
ReferenceURL: b'http://www.spitzer.caltech.edu/images/6274-ssc2017-01c-Surface-of-TRAPPIST-1f'
Credit: b'NASA/JPL-Caltech/T. Pyle (IPAC)'
Date: b'2017-02-22'
ID: b'ssc2017-01c'
Type: b'Artwork'
Publisher: b'Spitzer Science Center'
PublisherID: b'spitzer'
ResourceID: b'ssc2017-01c.jpg'
ResourceURL: b'http://www.spitzer.caltech.edu/uploaded_files/images/0010/9932/ssc201
'''