data/script/scrap_pyavm_spitzer.py
author ymh <ymh.work@gmail.com>
Mon, 30 Jul 2018 14:00:40 +0200
changeset 24 0c9c840b82dc
child 25 8690bf2fb09a
permissions -rw-r--r--
add spitzer scraping
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
import json
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
import os
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     3
from urllib import request
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     5
import requests
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     6
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     7
from bs4 import BeautifulSoup
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     8
from pyavm import AVM
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
     9
from pyavm.avm import AVMContainer
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    10
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    11
def convert_avm_container(s):
    """Return the contents of an AVM container as a plain nested dict.

    Reads the container's ``_items`` mapping. Entries whose value is None
    are left out, and any value that is itself an ``AVMContainer`` is
    converted recursively; every other value is copied through unchanged.
    """
    converted = {}
    for key, value in s._items.items():
        if value is None:
            # Unset fields are omitted from the result.
            continue
        if isinstance(value, AVMContainer):
            value = convert_avm_container(value)
        converted[key] = value
    return converted
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    29
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    30
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    31
# Scrape the Spitzer image gallery: for each image listed on a results page,
# open its detail page, download the JPEG whose name matches the image id,
# read the AVM metadata embedded in that file with pyavm, and write it as
# JSON next to the image under scrapSpitzer/<image id>/.
for page in range(1, 10):
    url = "http://www.spitzer.caltech.edu/search/image_set/20?resource=image_set&tabs=hidden&page={}".format(page)
    # A timeout keeps an unresponsive server from hanging the script forever;
    # raise_for_status() reports HTTP errors instead of parsing an error page.
    website = requests.get(url, timeout=60)
    website.raise_for_status()
    soup = BeautifulSoup(website.content, "html5lib")

    table = soup.find("table", {"class":"items"})
    # A results page without the items table simply yields no images.
    itemtds = table.find_all("td", {"class":"item"}) if table is not None else []
    for td in itemtds:
        # Skip cells that do not carry the expected <div><a href=...> link.
        detail_link = td.div.a if td.div is not None else None
        detail_href = detail_link.get('href') if detail_link is not None else None
        if not detail_href:
            continue
        detail_url = "http://www.spitzer.caltech.edu" + detail_href
        detail_response = requests.get(detail_url, timeout=60)
        detail_response.raise_for_status()
        detail_soup = BeautifulSoup(detail_response.content, "html5lib")

        img_id_elem = detail_soup.find("dd", {"property":"avm:ID"})
        img_box = detail_soup.find("div", {"class":"sidebar-section download"})
        if img_id_elem is None or img_box is None:
            # Without an id or a download section there is nothing to fetch.
            continue
        img_id = img_id_elem.getText()

        # Pick the download link whose file name is exactly "<id>.jpg".
        # href=True ignores anchors that have no href attribute at all.
        img_url = None
        for img_link in img_box.find_all("a", href=True):
            img_link_href = img_link.get("href")
            if img_link_href.endswith(img_id + ".jpg"):
                img_url = "http://www.spitzer.caltech.edu" + img_link_href
                break
        print(img_url)
        if not img_url:
            continue

        # exist_ok=True already tolerates an existing directory, so no
        # separate isdir() check is needed.
        os.makedirs('scrapSpitzer/' + img_id, exist_ok=True)
        img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id)
        json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id)
        request.urlretrieve(img_url, img_path)
        avm = AVM.from_image(img_path)
        img_data = convert_avm_container(avm)

        # convert_avm_container() drops unset fields, so Description may be
        # absent; only wrap it in a <div> when there is text to wrap.
        description = img_data.get('Description')
        if description is not None:
            description_html = '<div>' + description + "</div>"
        else:
            description_html = None

        img_json = {
            'avm': img_data,
            'image': {
                'id': img_data.get('ID'),
                'title': img_data.get('Title'),
                'description_text': description,
                'description_html': description_html,
                'date_publication' : img_data.get('Date'),
                'credit': img_data.get('Credit') or 'Courtesy NASA/JPL-Caltech',
                'type': img_data.get('Type'),
                'imgurl': img_data.get('ResourceURL')
            },
            'url': img_data.get('ReferenceURL')
        }

        with open(json_path, 'w') as outfile:
            json.dump(img_json, outfile)
    # NOTE(review): this stops after the first results page, so pages 2-9 of
    # the range above are never fetched. It looks like a debugging leftover;
    # confirm before removing it.
    break
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    82
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    83
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    84
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    85
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    86
# avm = AVM.from_image('eso1238a.jpg')
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    87
# print(avm)
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    88
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    89
'''
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    90
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    91
MetadataVersion: b'1.2'
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    92
Creator: b'Spitzer Space Telescope'
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    93
Title: b'Surface of TRAPPIST-1f'
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    94
Headline: b"Imagine standing on the surface of the exoplanet TRAPPIST-1f. This artist's concept is one interpretation of what it could look like. "
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    95
Description: b"This artist's concept allows us to imagine what it would be like to stand on the surface of the exoplanet TRAPPIST-1f, located in the TRAPPIST-1 system in the constellation Aquarius. \n\nBecause this planet is thought to be tidally locked to its star, meaning the same face of the planet is always pointed at the star, there would be a region called the terminator that perpetually divides day and night. If the night side is icy, the day side might give way to liquid water in the area where sufficient starlight hits the surface. \n\nOne of the unusual features of TRAPPIST-1 planets is how close they are to each other -- so close that other planets could be visible in the sky from the surface of each one. In this view, the planets in the sky correspond to TRAPPIST1e (top left crescent), d (middle crescent) and c (bright dot to the lower right of the crescents). TRAPPIST-1e would appear about the same size as the moon and TRAPPIST1-c is on the far side of the star. The star itself, an ultra-cool dwarf, would appear about three times larger than our own sun does in Earth's skies.\n\nThe TRAPPIST-1 system has been revealed through observations from NASA's Spitzer Space Telescope and the ground-based TRAPPIST (TRAnsiting Planets and PlanetesImals Small Telescope) telescope, as well as other ground-based observatories. The system was named for the TRAPPIST telescope.\n\nNASA's Jet Propulsion Laboratory, Pasadena, California, manages the Spitzer Space Telescope mission for NASA's Science Mission Directorate, Washington. Science operations are conducted at the Spitzer Science Center at Caltech in Pasadena. Spacecraft operations are based at Lockheed Martin Space Systems Company, Littleton, Colorado. Data are archived at the Infrared Science Archive housed at Caltech/IPAC. Caltech manages JPL for NASA. "
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    96
ReferenceURL: b'http://www.spitzer.caltech.edu/images/6274-ssc2017-01c-Surface-of-TRAPPIST-1f'
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    97
Credit: b'NASA/JPL-Caltech/T. Pyle (IPAC)'
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    98
Date: b'2017-02-22'
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
    99
ID: b'ssc2017-01c'
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
   100
Type: b'Artwork'
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
   101
Publisher: b'Spitzer Science Center'
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
   102
PublisherID: b'spitzer'
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
   103
ResourceID: b'ssc2017-01c.jpg'
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
   104
ResourceURL: b'http://www.spitzer.caltech.edu/uploaded_files/images/0010/9932/ssc201
0c9c840b82dc add spitzer scraping
ymh <ymh.work@gmail.com>
parents:
diff changeset
   105
'''