data/script/scrapeso.py
author ymh <ymh.work@gmail.com>
Thu, 28 Jun 2018 15:19:43 +0200
changeset 3 16fb4f5efa69
child 6 7e15a917d940
permissions -rw-r--r--
* add scraping script * correct size management problem on importimages
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
3
16fb4f5efa69 * add scraping script
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
import json
16fb4f5efa69 * add scraping script
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
import os
16fb4f5efa69 * add scraping script
ymh <ymh.work@gmail.com>
parents:
diff changeset
     3
import sys, getopt
16fb4f5efa69 * add scraping script
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
import demjson
16fb4f5efa69 * add scraping script
ymh <ymh.work@gmail.com>
parents:
diff changeset
     5
from bs4 import BeautifulSoup
16fb4f5efa69 * add scraping script
ymh <ymh.work@gmail.com>
parents:
diff changeset
     6
import urllib.request
16fb4f5efa69 * add scraping script
ymh <ymh.work@gmail.com>
parents:
diff changeset
     7
import requests
16fb4f5efa69 * add scraping script
ymh <ymh.work@gmail.com>
parents:
diff changeset
     8
16fb4f5efa69 * add scraping script
ymh <ymh.work@gmail.com>
parents:
diff changeset
     9
def main(argv):
    """Scrape an ESO image listing and download every image it references.

    Listing pages ``<listing url>/list/<n>`` are visited for n = 1, 2, ...
    until one answers with an HTTP status >= 400 or carries no image list.
    On each page the JavaScript array assigned to ``var images`` is decoded;
    for every entry the detail page is fetched, its metadata is written to
    ``<directory>/<id>/<id>.json`` and the first reachable image rendition is
    saved in the same folder. The raw JavaScript arrays are also appended,
    comma separated, to ``<directory>/allfiles.json``.

    Args:
        argv: command-line arguments without the program name. Recognised
            options:
            ``-h``              print usage and exit;
            ``-d/--dir DIR``    output directory (default ``./scrapESO/``);
            ``-u/--url URL``    image group url, ``list/`` is appended
                                (for example
                                https://www.eso.org/public/france/images/archive/category/alma/);
            ``-o/--original``   try the original/publication TIFF renditions
                                before falling back to the large JPEG.

    Raises:
        SystemExit: with code 2 on an unknown option, with code 0 after ``-h``.
    """
    directory = './scrapESO/'
    imgloturl = 'https://www.eso.org/public/france/images/viewall/list/'
    # (download url pattern, local file name pattern), tried in order.
    img_url_patterns = [
        ('https://cdn.eso.org/images/publicationjpg/{imgid}.jpg', '{imgid}.jpg'),
        ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg'),
    ]
    try:
        # "o"/"original" must be declared here, otherwise getopt rejects the
        # option and the branch handling it below can never run.
        opts, _ = getopt.getopt(argv, "hd:u:o", ["dir=", "url=", "original"])
    except getopt.GetoptError:
        print('test.py -d <directory> -u <imagegroupurl> [-o]')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('''test.py     -d <directory> (default is : "./scrapESO/")
            -u <imagegroupurl> (default is : "https://www.eso.org/public/france/images/viewall/list/")
            -o download original images when available''')
            sys.exit()
        elif opt in ("-d", "--dir"):
            directory = arg
        elif opt in ("-u", "--url"):
            # Accept the group url with or without its trailing slash.
            imgloturl = arg.rstrip("/") + "/list/"
        elif opt in ("-o", "--original"):
            img_url_patterns = [
                ('https://www.eso.org/public/archives/images/original/{imgid}.tif', '{imgid}.tif'),
                ('https://www.eso.org/public/archives/images/publicationtiff/{imgid}.tif', '{imgid}.tif'),
                ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg'),
            ]

    os.makedirs(directory, exist_ok=True)
    jsonfile = os.path.join(directory, "allfiles.json")
    with open(jsonfile, "w") as json_all:
        page_number = 1
        while True:
            eso = imgloturl + str(page_number)
            if requests.head(eso).status_code >= 400:
                break
            listjs = _extract_image_list(requests.get(eso).text)
            if listjs is None:
                # Without a list there is nothing to scrape; treat the page
                # as the end of the listing instead of reusing stale data.
                print("No image list found on " + eso)
                break
            json_all.write(listjs + ",")
            for entry in demjson.decode(listjs):
                _scrape_image("https://www.eso.org" + entry['url'],
                              directory, img_url_patterns)
            page_number += 1


def _extract_image_list(html):
    """Return the JavaScript array literal of the ``var images`` script, or None."""
    soup = BeautifulSoup(html, "html5lib")
    for script in soup.find_all('script'):
        scripttext = script.text
        if "var images" in scripttext:
            start = scripttext.find("[")
            if start == -1:
                return None
            # The last two characters are dropped; presumably the statement
            # terminator and a newline after the closing bracket -- confirm
            # against a live listing page.
            return scripttext[start:-2]
    return None


def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or '' when the tag is missing."""
    return tag.text if tag is not None else ""


def _scrape_image(realurl, directory, img_url_patterns):
    """Fetch one image detail page, then store its metadata and image file."""
    soup = BeautifulSoup(requests.get(realurl).text, "html5lib")

    imagesoup = soup.find("div", class_="archive-image archive-image-dark popup")
    if not imagesoup:
        print("Image not found for " + realurl)
        return
    print(realurl)

    image = {
        'title': _tag_text(soup.find("h1")),
        'credit': _tag_text(soup.find("div", class_="credit")),
    }
    obj = {}
    infosdict = {'url': realurl, 'image': image, 'object': obj}

    # The description is every paragraph following the image block, up to
    # the first one whose preceding element mentions "credit".
    text_parts = []
    html_parts = []
    for paragraph in imagesoup.find_all_next("p"):
        if "credit" in str(paragraph.previous_element):
            break
        paragraph_text = paragraph.text.strip()
        if paragraph_text:
            text_parts.append(paragraph_text + "\n")
        paragraph_html = str(paragraph).strip()
        if paragraph_html != "<p></p>":
            html_parts.append(paragraph_html)
    image['description_text'] = "".join(text_parts)
    image['description_html'] = "".join(html_parts)

    # First table of the "object-info" block describes the image, the second
    # one the observed object; either may be absent.
    infoblock = soup.find("div", class_="object-info")
    tables = infoblock.find_all("table") if infoblock is not None else []

    imgid = None
    if tables:
        for row in tables[0].find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 2:
                continue
            if "Id" in row.text:
                imgid = cells[1].text
                image['id'] = imgid
            if "Type" in row.text:
                image['type'] = cells[1].text
            if "Date" in row.text or "date" in row.text:
                image['date_publication'] = cells[1].text
    if len(tables) > 1:
        for row in tables[1].find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 2:
                continue
            if "Name" in row.text or "Nom" in row.text:
                obj['name'] = cells[1].text
            if "Type" in row.text:
                obj['type'] = cells[1].text

    if imgid is None:
        # Every url and path below is derived from the id.
        print("Image id not found for " + realurl)
        return

    imgurl = None
    img = None
    for imgurl_pattern, img_pattern in img_url_patterns:
        imgurl_test = imgurl_pattern.format(imgid=imgid)
        if requests.head(imgurl_test).status_code == 200:
            imgurl = imgurl_test
            img = img_pattern.format(imgid=imgid)
            break
    if imgurl is None:
        # No rendition is reachable: the image is skipped silently.
        return

    image['imgurl'] = imgurl
    imgdirectory = os.path.join(directory, imgid)
    # exist_ok lets the script be re-run over an existing output directory.
    os.makedirs(imgdirectory, exist_ok=True)
    urllib.request.urlretrieve(imgurl, os.path.join(imgdirectory, img))
    with open(os.path.join(imgdirectory, imgid + ".json"), "w") as json_img_file:
        json_img_file.write(
            json.dumps(infosdict, sort_keys=True, indent=4, separators=(',', ': ')))
    print(realurl + " : OK")
16fb4f5efa69 * add scraping script
ymh <ymh.work@gmail.com>
parents:
diff changeset
   138
16fb4f5efa69 * add scraping script
ymh <ymh.work@gmail.com>
parents:
diff changeset
   139
16fb4f5efa69 * add scraping script
ymh <ymh.work@gmail.com>
parents:
diff changeset
   140
# Script entry point: hand the command-line arguments (program name
# dropped) over to main() when the file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])