data/script/scrapeso.py
changeset 3 16fb4f5efa69
child 6 7e15a917d940
equal deleted inserted replaced
2:d3fe1866eb5b 3:16fb4f5efa69
       
     1 import json
       
     2 import os
       
     3 import sys, getopt
       
     4 import demjson
       
     5 from bs4 import BeautifulSoup
       
     6 import urllib.request
       
     7 import requests
       
     8 
       
     9 def main(argv):
       
    10     '''
       
    11     please put image lot url
       
    12     (for example https://www.eso.org/public/france/images/archive/category/alma/) as first arg
       
    13     and directory as second arg
       
    14     '''
       
    15     directory = './scrapESO/'
       
    16     imgloturl = 'https://www.eso.org/public/france/images/viewall/list/'
       
    17     img_url_patterns = [
       
    18         ('https://cdn.eso.org/images/publicationjpg/{imgid}.jpg', '{imgid}.jpg')
       
    19         ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')
       
    20     ]
       
    21     try:
       
    22         opts, args = getopt.getopt(argv, "hd:u:", ["dir=","url="])
       
    23     except getopt.GetoptError:
       
    24         print('test.py -d <directory> -u <imagegroupurl>')
       
    25         sys.exit(2)
       
    26     for opt, arg in opts:
       
    27         if opt == '-h':
       
    28             print('''test.py     -d <directory> (default is : "./scrapESO/")
       
    29             -u <imagegroupurl> (default is : "https://www.eso.org/public/france/images/viewall/list/"''')
       
    30             sys.exit()
       
    31         elif opt in ("-d", "--dir"):
       
    32             directory = arg
       
    33         elif opt in ("-u", "--url"):
       
    34             imgloturl = arg + "list/"
       
    35         elif opt in ("-o", "--original"):
       
    36             img_url_patterns = [
       
    37                 ('https://www.eso.org/public/archives/images/original/{imgid}.tif', '{imgid}.tif'),
       
    38                 ('https://www.eso.org/public/archives/images/publicationtiff/{imgid}.tif', '{imgid}.tif'),
       
    39                 ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')]
       
    40 
       
    41     if not os.path.exists(directory):
       
    42         os.mkdir(directory)
       
    43     jsonfile = directory + "allfiles.json"
       
    44     json_all = open(jsonfile, "w")
       
    45     i = 1
       
    46     eso = imgloturl + str(i)
       
    47     while requests.head(eso).status_code < 400:
       
    48         page = requests.get(eso)
       
    49         soup = BeautifulSoup(page.text, "html5lib")
       
    50         start = 0
       
    51         for scripts in soup.find_all('script'):
       
    52             if "var images" in scripts.text:
       
    53                 scripttext = scripts.text
       
    54                 break
       
    55         for scriptchar in scripttext:
       
    56             if scriptchar == "[":
       
    57                 break
       
    58             start += 1
       
    59         listjs = scripttext[start:-2]
       
    60         json_all.write(listjs + ",")
       
    61         listdem = demjson.decode(listjs)
       
    62         for j in listdem:
       
    63             infosdict = {}
       
    64             infosdict['image'] = {}
       
    65             infosdict['object'] = {}
       
    66             realurl = "https://www.eso.org" + j['url']
       
    67             page = requests.get(realurl)
       
    68             #print(realurl)
       
    69             soup = BeautifulSoup(page.text, "html5lib")
       
    70             infosimg = soup.find("div", class_="object-info").find_all("table")[0]
       
    71             infosobj = soup.find("div", class_="object-info").find_all("table")[1]
       
    72             infosdict['url'] = realurl
       
    73             #print(realurl)
       
    74             title = soup.find("h1").text
       
    75             infosdict['image']['title'] = title
       
    76             infosdict['image']['credit'] = soup.find("div", class_="credit").text
       
    77             imagesoup = soup.find("div", class_="archive-image archive-image-dark popup")
       
    78             if not imagesoup:
       
    79                 print("Image not found for " + realurl)
       
    80                 continue
       
    81             #print(realurl)
       
    82         #if imagesoup.isnotNone:
       
    83             print(realurl)
       
    84             descriptionps = imagesoup.find_all_next("p")
       
    85             descriptiontext = ""
       
    86             descriptionhtml = ""
       
    87             for descriptionp in descriptionps:
       
    88                 if "credit" in str(descriptionp.previous_element):
       
    89                     break
       
    90                 descriptiontext_p = descriptionp.text.strip()
       
    91                 if descriptiontext_p:
       
    92                     descriptiontext += descriptiontext_p + "\n"
       
    93 
       
    94                 descriptionhtml_p = str(descriptionp).strip()
       
    95                 if descriptionhtml_p != "<p></p>":
       
    96                     descriptionhtml += descriptionhtml_p
       
    97             infosdict['image']['description_text'] = descriptiontext
       
    98             infosdict['image']['description_html'] = descriptionhtml
       
    99             for tableimg in infosimg.find_all("tr"):
       
   100                 if "Id" in tableimg.text:
       
   101                     imgid = tableimg.find_all("td")[1].text
       
   102                     infosdict['image']['id'] = imgid
       
   103                 if "Type" in tableimg.text:
       
   104                     infosdict['image']['type'] = tableimg.find_all("td")[1].text
       
   105                 if "Date" in tableimg.text or "date" in tableimg.text:
       
   106                     infosdict['image']['date_publication'] = tableimg.find_all("td")[1].text
       
   107             for tableobj in infosobj.find_all("tr"):
       
   108                 if "Name" in tableobj.text or "Nom" in tableobj.text:
       
   109                     infosdict['object']['name'] = tableobj.find_all("td")[1].text
       
   110                 if "Type" in tableobj.text:
       
   111                     infosdict['object']['type'] = tableobj.find_all("td")[1].text
       
   112             imgurl = None
       
   113             img = None
       
   114             for imgurl_pattern, img_pattern in img_url_patterns:
       
   115                 imgurl_test = imgurl_pattern.format(imgid=imgid)
       
   116                 if requests.head(imgurl_test).status_code == 200:
       
   117                     imgurl = imgurl_test
       
   118                     img = img_pattern.format(imgid=imgid)
       
   119                     break
       
   120             if imgurl is None:
       
   121                 continue
       
   122 
       
   123             infosdict['image']['imgurl'] = imgurl
       
   124             imgdirectory = directory + imgid
       
   125         #if not os.path.exists(imgdirectory):
       
   126             os.mkdir(imgdirectory)
       
   127             imgdirection = imgdirectory + "/" + img
       
   128             urllib.request.urlretrieve(imgurl, imgdirection)
       
   129             jsonfname = imgid + ".json"
       
   130             jsondirection = imgdirectory + "/" + jsonfname
       
   131             json_img_file = open(jsondirection, "w")
       
   132             json_img_file.write(json.dumps(infosdict, sort_keys=True, indent=4, separators=(',', ': ')))
       
   133             json_img_file.close()
       
   134             print(realurl + " : OK")
       
   135         i += 1
       
   136         eso = imgloturl + str(i)
       
   137     json_all.close()
       
   138 
       
   139 
       
   140 if __name__ == '__main__':
       
   141     main(sys.argv[1:])