iconolab-episteme: data/script/scrap_pyavm_spitzer.py@8690bf2fb09a (annotated)

24 0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	1	import json
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	2	import os
25 8690bf2fb09a Small improvement on scrapSpitzer ymh <ymh.work@gmail.com> parents: 24 diff changeset	3	import os.path
24 0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	4	from urllib import request
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	5
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	6	import requests
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	7
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	8	from bs4 import BeautifulSoup
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	9	from pyavm import AVM
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	10	from pyavm.avm import AVMContainer
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	11
def convert_avm_container(s):
    """Convert an AVM container into a plain (nested) dictionary.

    Reads the container's ``_items`` mapping, drops every entry whose value
    is ``None`` and recursively converts any value that is itself an
    ``AVMContainer``. All other values are copied through unchanged.

    :param s: object exposing an ``_items`` mapping (an ``AVM`` or
        ``AVMContainer`` instance in this script).
    :return: dict holding only the entries that are set.
    """
    avm_items = s._items
    converted = {}
    for key in avm_items.keys():
        value = avm_items.get(key)
        if value is None:
            # Unset AVM fields are left out of the result entirely.
            continue
        if isinstance(value, AVMContainer):
            # Nested containers get the same treatment.
            converted[key] = convert_avm_container(value)
        else:
            converted[key] = value
    return converted
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	30
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	31
# Root of the Spitzer site; hrefs found in the pages are appended to it as-is.
SPITZER_BASE_URL = "http://www.spitzer.caltech.edu"
SPITZER_SEARCH_URL = SPITZER_BASE_URL + "/search/image_set/100?resource=image_set&tabs=hidden&page={}"
# Seconds to wait on a request; without a timeout requests.get can block forever.
HTTP_TIMEOUT = 30


def _fetch_soup(url):
    """Download *url* and return it parsed with html5lib.

    Raises ``requests.RequestException`` on network failure, timeout or a
    non-2xx HTTP status, so an error page is never parsed as a result page.
    """
    response = requests.get(url, timeout=HTTP_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html5lib")


def _find_image_url(detail_soup, img_id):
    """Return the absolute URL of the ``<img_id>.jpg`` download link, or None."""
    img_box = detail_soup.find("div", {"class": "sidebar-section download"})
    if img_box is None:
        return None
    for img_link in img_box.find_all("a"):
        img_link_href = img_link.get("href")
        # Anchors without an href would make endswith() fail on None.
        if img_link_href and img_link_href.endswith(img_id + ".jpg"):
            return SPITZER_BASE_URL + img_link_href
    return None


def _build_image_record(img_data):
    """Build the JSON document written next to each image from its AVM dict."""
    description = img_data.get('Description')
    return {
        'avm': img_data,
        'image': {
            'id': img_data.get('ID'),
            'title': img_data.get('Title'),
            'description_text': description,
            # A missing description stays None instead of raising on '<div>' + None.
            'description_html': None if description is None else '<div>' + description + "</div>",
            'date_publication': img_data.get('Date'),
            'credit': img_data.get('Credit') or 'Courtesy NASA/JPL-Caltech',
            'type': img_data.get('Type'),
            'imgurl': img_data.get('ResourceURL')
        },
        'url': img_data.get('ReferenceURL')
    }


def _scrape_item(td):
    """Download the image behind one listing cell and write its JSON record.

    Items whose detail page cannot be fetched, or that lack the expected
    markup, are reported and skipped so one bad entry does not abort the run.
    """
    link = td.div.a if td.div is not None else None
    href = link.get('href') if link is not None else None
    if not href:
        print("--> no detail link in listing cell, skipping")
        return

    detail_url = SPITZER_BASE_URL + href
    try:
        detail_soup = _fetch_soup(detail_url)
    except requests.RequestException as err:
        print("--> cannot fetch %s: %s" % (detail_url, err))
        return

    img_id_elem = detail_soup.find("dd", {"property": "avm:ID"})
    if img_id_elem is None:
        print("--> no avm:ID in %s, skipping" % detail_url)
        return
    img_id = img_id_elem.getText()

    img_url = _find_image_url(detail_soup, img_id)
    print(img_url)
    if not img_url:
        return

    # exist_ok makes a prior isdir() check unnecessary.
    os.makedirs('scrapSpitzer/' + img_id, exist_ok=True)
    img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id)
    json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id)

    have_image = os.path.isfile(img_path)
    if have_image and os.path.isfile(json_path):
        print("--> file %s exists from url %s" % (img_path, img_url))
        return

    if not have_image:
        try:
            request.urlretrieve(img_url, img_path)
        except OSError:
            # Remove a partial download so a later run does not mistake it
            # for a complete image.
            if os.path.isfile(img_path):
                os.remove(img_path)
            raise

    # Reached also when the image exists but the JSON is missing, e.g. after
    # a run interrupted between download and metadata extraction.
    avm = AVM.from_image(img_path)
    img_data = convert_avm_container(avm)

    with open(json_path, 'w') as outfile:
        json.dump(_build_image_record(img_data), outfile)


# Walk listing pages 1..9 and stop at the first page that yields no item.
for page in range(1, 10):
    soup = _fetch_soup(SPITZER_SEARCH_URL.format(page))

    table = soup.find("table", {"class": "items"})
    # A page without the items table is treated like an empty page.
    itemtds = table.find_all("td", {"class": "item"}) if table is not None else []

    if not itemtds:
        print("No item in page, exit.")
        break

    for td in itemtds:
        _scrape_item(td)
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	90
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	91
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	92
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	93	'''
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	94
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	95	MetadataVersion: b'1.2'
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	96	Creator: b'Spitzer Space Telescope'
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	97	Title: b'Surface of TRAPPIST-1f'
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	98	Headline: b"Imagine standing on the surface of the exoplanet TRAPPIST-1f. This artist's concept is one interpretation of what it could look like. "
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	99	Description: b"This artist's concept allows us to imagine what it would be like to stand on the surface of the exoplanet TRAPPIST-1f, located in the TRAPPIST-1 system in the constellation Aquarius. \n\nBecause this planet is thought to be tidally locked to its star, meaning the same face of the planet is always pointed at the star, there would be a region called the terminator that perpetually divides day and night. If the night side is icy, the day side might give way to liquid water in the area where sufficient starlight hits the surface. \n\nOne of the unusual features of TRAPPIST-1 planets is how close they are to each other -- so close that other planets could be visible in the sky from the surface of each one. In this view, the planets in the sky correspond to TRAPPIST1e (top left crescent), d (middle crescent) and c (bright dot to the lower right of the crescents). TRAPPIST-1e would appear about the same size as the moon and TRAPPIST1-c is on the far side of the star. The star itself, an ultra-cool dwarf, would appear about three times larger than our own sun does in Earth's skies.\n\nThe TRAPPIST-1 system has been revealed through observations from NASA's Spitzer Space Telescope and the ground-based TRAPPIST (TRAnsiting Planets and PlanetesImals Small Telescope) telescope, as well as other ground-based observatories. The system was named for the TRAPPIST telescope.\n\nNASA's Jet Propulsion Laboratory, Pasadena, California, manages the Spitzer Space Telescope mission for NASA's Science Mission Directorate, Washington. Science operations are conducted at the Spitzer Science Center at Caltech in Pasadena. Spacecraft operations are based at Lockheed Martin Space Systems Company, Littleton, Colorado. Data are archived at the Infrared Science Archive housed at Caltech/IPAC. Caltech manages JPL for NASA. "
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	100	ReferenceURL: b'http://www.spitzer.caltech.edu/images/6274-ssc2017-01c-Surface-of-TRAPPIST-1f'
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	101	Credit: b'NASA/JPL-Caltech/T. Pyle (IPAC)'
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	102	Date: b'2017-02-22'
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	103	ID: b'ssc2017-01c'
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	104	Type: b'Artwork'
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	105	Publisher: b'Spitzer Science Center'
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	106	PublisherID: b'spitzer'
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	107	ResourceID: b'ssc2017-01c.jpg'
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	108	ResourceURL: b'http://www.spitzer.caltech.edu/uploaded_files/images/0010/9932/ssc201
0c9c840b82dc add spitzer scraping ymh <ymh.work@gmail.com> parents: diff changeset	109	'''

author	ymh <ymh.work@gmail.com>
	Mon, 30 Jul 2018 14:21:28 +0200
changeset 25	8690bf2fb09a
parent 24	0c9c840b82dc
child 26	957d03d2bc26
permissions	-rw-r--r--