# iconolab-episteme: data/script/scrap_pyavm


import glob
import json
import os
import os.path
import re
import shutil
import sys
import traceback
from urllib import parse as urlparse
from urllib import request

import requests

from bs4 import BeautifulSoup
from PIL import Image
from pyavm import AVM
from pyavm.avm import AVMContainer


def convert_avm_container(s):
    """Recursively turn an AVM container into a plain dictionary.

    The entries are read from the container's private ``_items`` mapping.
    Entries whose value is ``None`` are left out, and values that are
    themselves ``AVMContainer`` instances are converted the same way.

    :param s: object exposing an ``_items`` mapping (an ``AVM`` or
        ``AVMContainer`` instance in this script).
    :return: dict holding the non-``None`` entries, with nested containers
        replaced by dicts.
    """
    converted = {}
    for name, value in s._items.items():
        # Unset entries are dropped before any type check is made.
        if value is None:
            continue
        if isinstance(value, AVMContainer):
            value = convert_avm_container(value)
        converted[name] = value
    return converted
    # res = {}
    # for k in dir(s):
    #     v = getattr(s,k)
    #     if v is None:
    #         continue
    #     if isinstance(v, AVMContainer):
    #         print("%s IS AVMContainer %r" % (k, v))
    #         res[k] = convert_avm_container(v)
    #     else:
    #         res[k] = v
    # return res
# Matches a "<width> x <height>" size inside a download link label. The height
# must be followed by whitespace or by the end of the text, so a label that
# stops right after the height is still recognised.
IMG_LINK_RE = re.compile(r"(\d+)\s*x\s*(\d+)(?:\s|$)")


def parse_img_link_size(img_link_txt):
    """Return the pixel count announced in a download link label.

    :param img_link_txt: text of the link, e.g. ``"3000 x 2400 JPEG"``.
    :return: ``width * height`` for the first size found in the text, or 0
        when the text contains no recognisable size.
    """
    m = IMG_LINK_RE.search(img_link_txt)
    if m is None:
        return 0
    width, height = m.groups()
    return int(width) * int(height)

# Root of the Spitzer site; hrefs scraped from its pages are resolved against it.
BASE_URL = "http://www.spitzer.caltech.edu"
# Seconds without any server response after which an HTML page request is abandoned.
REQUEST_TIMEOUT = 60


def _largest_link(links, suffix):
    """Return the href of the biggest ``(href, size)`` link ending in *suffix*, or None."""
    matching = [link for link in links if link[0].endswith(suffix)]
    if not matching:
        return None
    # max() keeps the first of equally sized links, exactly like picking the
    # head of a stable descending sort.
    return max(matching, key=lambda link: link[1])[0]


def _choose_image_href(img_id, links):
    """Pick the href to download among the ``(href, size)`` download links.

    Preference order: the JPEG named after the image id, then the largest
    JPEG, then the largest TIFF. Returns None when no link qualifies.
    """
    for href, _size in links:
        if href.endswith(img_id + ".jpg"):
            return href
    return _largest_link(links, ".jpg") or _largest_link(links, ".tif")


# Walk the search result pages, and for every listed image download the picture,
# make sure a JPEG version exists, and dump its AVM metadata next to it as JSON.
# Everything is stored under scrapSpitzer/<slug>/ ; already scraped images are skipped.
for page in range(1, 10):
    url = BASE_URL + "/search/image_set/100?resource=image_set&tabs=hidden&page={}".format(page)
    website = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(website.content, "html5lib")

    table = soup.find("table", {"class": "items"})
    # A page without the listing table is handled like a page without items
    # instead of crashing on None.
    itemtds = table.find_all("td", {"class": "item"}) if table is not None else []

    if not itemtds:
        print("No item in page, exit.")
        break

    for td in itemtds:
        anchor = td.div.a if td.div is not None else None
        img_href = anchor.get('href') if anchor is not None else None
        if not img_href:
            print("--> item without link, skipping")
            continue
        print("IMG HREF %s" % img_href)
        img_slug = img_href.split("/")[-1]
        img_dir_path = 'scrapSpitzer/' + img_slug
        if os.path.isdir(img_dir_path) and glob.glob("%s/*.jpg" % img_dir_path) and glob.glob("%s/*.json" % img_dir_path):
            print("--> img %s exists" % (img_slug,))
            continue
        os.makedirs(img_dir_path, exist_ok=True)

        # urljoin copes with both site-relative and absolute hrefs.
        detail_url = urlparse.urljoin(BASE_URL, img_href)

        try:
            detail_content = requests.get(detail_url, timeout=REQUEST_TIMEOUT).content
            detail_soup = BeautifulSoup(detail_content, "html5lib")
            img_id_elem = detail_soup.find("dd", {"property": "avm:ID"})
            img_id = img_id_elem.getText().strip() if img_id_elem is not None else None
            if not img_id:
                print("--> img %s has no id, skipping" % img_href)
                continue

            img_box = detail_soup.find("div", {"class": "sidebar-section download"})
            # Only anchors that carry an href can be downloaded; a missing
            # download box simply yields no candidate.
            img_links = img_box.find_all("a", href=True) if img_box is not None else []
            img_link_href_array = [
                (img_link.get("href"), parse_img_link_size(img_link.getText()))
                for img_link in img_links
            ]

            chosen_href = _choose_image_href(img_id, img_link_href_array)
            if not chosen_href:
                print("NO IMG URL for %s : %r" % (img_id, img_link_href_array))
                continue
            img_url = urlparse.urljoin(BASE_URL, chosen_href)
            print("IMG URL %s" % img_url)

            # splitext keeps the leading dot: img_ext is ".jpg" or ".tif".
            img_ext = os.path.splitext(urlparse.urlparse(img_url).path)[1]
            needs_conversion = img_ext != ".jpg"
            orig_img_path = '{0}/{1}{2}'.format(img_dir_path, img_id, img_ext)
            img_path = '{0}/{1}.jpg'.format(img_dir_path, img_id)
            json_path = '{0}/{1}.json'.format(img_dir_path, img_id)
            if os.path.isfile(img_path) and os.path.isfile(json_path):
                print("--> file %s exists from url %s" % (img_path, img_url))
                continue

            if not os.path.isfile(orig_img_path):
                # Download under a temporary name so that an interrupted
                # transfer is never mistaken for a complete file on a later run.
                part_path = orig_img_path + ".part"
                request.urlretrieve(img_url, part_path)
                os.replace(part_path, orig_img_path)

            if needs_conversion and not os.path.isfile(img_path):
                with Image.open(orig_img_path) as im:
                    im.convert('RGB').save(img_path)

            avm = AVM.from_image(orig_img_path)
            img_data = convert_avm_container(avm)
            description = img_data.get('Description') or ""

            img_json = {
                'avm': img_data,
                'image': {
                    'id': img_data.get('ID'),
                    'title': img_data.get('Title'),
                    'description_text': description,
                    # NOTE(review): the description is inserted without HTML
                    # escaping — confirm consumers expect raw markup here.
                    'description_html': '<div>' + description + "</div>",
                    'date_publication': img_data.get('Date'),
                    'credit': img_data.get('Credit') or 'Courtesy NASA/JPL-Caltech',
                    'type': img_data.get('Type'),
                    'imgurl': img_data.get('ResourceURL')
                },
                'url': img_data.get('ReferenceURL')
            }

            with open(json_path, 'w') as outfile:
                json.dump(img_json, outfile)

            # Only a freshly converted JPEG lacks the metadata; a downloaded
            # JPEG is the file the metadata was read from.
            if needs_conversion:
                avm.embed(img_path, img_path)

        except Exception:
            # Best effort: report the failure and go on with the next image.
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so they still stop the script.
            print("--> error processing %s" % (img_href,))
            traceback.print_exc(file=sys.stdout)

# '''

# MetadataVersion: b'1.2'
# Creator: b'Spitzer Space Telescope'
# Title: b'Surface of TRAPPIST-1f'
# Headline: b"Imagine standing on the surface of the exoplanet TRAPPIST-1f. This artist's concept is one interpretation of what it could look like. "
# Description: b"This artist's concept allows us to imagine what it would be like to stand on the surface of the exoplanet TRAPPIST-1f, located in the TRAPPIST-1 system in the constellation Aquarius. \n\nBecause this planet is thought to be tidally locked to its star, meaning the same face of the planet is always pointed at the star, there would be a region called the terminator that perpetually divides day and night. If the night side is icy, the day side might give way to liquid water in the area where sufficient starlight hits the surface. \n\nOne of the unusual features of TRAPPIST-1 planets is how close they are to each other -- so close that other planets could be visible in the sky from the surface of each one. In this view, the planets in the sky correspond to TRAPPIST1e (top left crescent), d (middle crescent) and c (bright dot to the lower right of the crescents). TRAPPIST-1e would appear about the same size as the moon and TRAPPIST1-c is on the far side of the star. The star itself, an ultra-cool dwarf, would appear about three times larger than our own sun does in Earth's skies.\n\nThe TRAPPIST-1 system has been revealed through observations from NASA's Spitzer Space Telescope and the ground-based TRAPPIST (TRAnsiting Planets and PlanetesImals Small Telescope) telescope, as well as other ground-based observatories. The system was named for the TRAPPIST telescope.\n\nNASA's Jet Propulsion Laboratory, Pasadena, California, manages the Spitzer Space Telescope mission for NASA's Science Mission Directorate, Washington. Science operations are conducted at the Spitzer Science Center at Caltech in Pasadena. Spacecraft operations are based at Lockheed Martin Space Systems Company, Littleton, Colorado. Data are archived at the Infrared Science Archive housed at Caltech/IPAC. Caltech manages JPL for NASA. "
# ReferenceURL: b'http://www.spitzer.caltech.edu/images/6274-ssc2017-01c-Surface-of-TRAPPIST-1f'
# Credit: b'NASA/JPL-Caltech/T. Pyle (IPAC)'
# Date: b'2017-02-22'
# ID: b'ssc2017-01c'
# Type: b'Artwork'
# Publisher: b'Spitzer Science Center'
# PublisherID: b'spitzer'
# ResourceID: b'ssc2017-01c.jpg'
# ResourceURL: b'http://www.spitzer.caltech.edu/uploaded_files/images/0010/9932/ssc201
# '''
# author	ymh <ymh.work@gmail.com>
# 	Wed, 01 Aug 2018 14:43:44 +0200
# changeset 29	9de311703ab9
# parent 26	957d03d2bc26
# permissions	-rw-r--r--