data/script/scrap_pyavm_spitzer.py
changeset 25 8690bf2fb09a
parent 24 0c9c840b82dc
child 26 957d03d2bc26
equal deleted inserted replaced
24:0c9c840b82dc 25:8690bf2fb09a
     1 import json
     1 import json
     2 import os
     2 import os
       
     3 import os.path
     3 from urllib import request
     4 from urllib import request
     4 
     5 
     5 import requests
     6 import requests
     6 
     7 
     7 from bs4 import BeautifulSoup
     8 from bs4 import BeautifulSoup
    27     #         res[k] = v
    28     #         res[k] = v
    28     # return res
    29     # return res
    29 
    30 
    30 
    31 
    31 for page in range (1, 10):
    32 for page in range (1, 10):
    32     url = "http://www.spitzer.caltech.edu/search/image_set/20?resource=image_set&tabs=hidden&page={}".format(page)
    33     url = "http://www.spitzer.caltech.edu/search/image_set/100?resource=image_set&tabs=hidden&page={}".format(page)
    33     website = requests.get(url)
    34     website = requests.get(url)
    34     soup = BeautifulSoup(website.content, "html5lib")
    35     soup = BeautifulSoup(website.content, "html5lib")
    35 
    36 
    36     table = soup.find("table", {"class":"items"})
    37     table = soup.find("table", {"class":"items"})
    37     itemtds = table.find_all("td", {"class":"item"})
    38     itemtds = table.find_all("td", {"class":"item"})
       
    39 
       
    40     if len(itemtds) == 0:
       
    41         print("No item in page, exit.")
       
    42         break
       
    43 
    38     for td in itemtds:
    44     for td in itemtds:
    39     # itemlinks = find("a", href=True)
    45     # itemlinks = find("a", href=True)
    40         # print(td.div.a.get('href'))
    46         # print(td.div.a.get('href'))
    41         detail_url = "http://www.spitzer.caltech.edu" + td.div.a.get('href')
    47         detail_url = "http://www.spitzer.caltech.edu" + td.div.a.get('href')
    42         detail_content = requests.get(detail_url).content
    48         detail_content = requests.get(detail_url).content
    55             continue
    61             continue
    56         if not os.path.isdir('scrapSpitzer/' + img_id):
    62         if not os.path.isdir('scrapSpitzer/' + img_id):
    57             os.makedirs('scrapSpitzer/' + img_id, exist_ok=True)
    63             os.makedirs('scrapSpitzer/' + img_id, exist_ok=True)
    58         img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id)
    64         img_path = 'scrapSpitzer/{0}/{0}.jpg'.format(img_id)
    59         json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id)
    65         json_path = 'scrapSpitzer/{0}/{0}.json'.format(img_id)
       
    66         if os.path.isfile(img_path):
       
    67             print("--> file %s exists from url %s" % (img_path,img_url))
       
    68             continue
    60         request.urlretrieve(img_url, img_path)
    69         request.urlretrieve(img_url, img_path)
    61         avm = AVM.from_image(img_path)
    70         avm = AVM.from_image(img_path)
    62         img_data = convert_avm_container(avm)
    71         img_data = convert_avm_container(avm)
    63 
    72 
    64         img_json = { 
    73         img_json = { 
    76             'url': img_data.get('ReferenceURL')
    85             'url': img_data.get('ReferenceURL')
    77         }
    86         }
    78 
    87 
    79         with open(json_path, 'w') as outfile:
    88         with open(json_path, 'w') as outfile:
    80             json.dump(img_json, outfile)
    89             json.dump(img_json, outfile)
    81     break
       
    82 
    90 
    83 
    91 
    84 
       
    85 
       
    86 # avm = AVM.from_image('eso1238a.jpg')
       
    87 # print(avm)
       
    88 
    92 
    89 '''
    93 '''
    90 
    94 
    91 MetadataVersion: b'1.2'
    95 MetadataVersion: b'1.2'
    92 Creator: b'Spitzer Space Telescope'
    96 Creator: b'Spitzer Space Telescope'