data/script/scrapeso.py
changeset 7 a40fd3990850
parent 6 7e15a917d940
child 8 5459fff1356b
equal deleted inserted replaced
6:7e15a917d940 7:a40fd3990850
    62         for j in listdem:
    62         for j in listdem:
    63             infosdict = {}
    63             infosdict = {}
    64             infosdict['image'] = {}
    64             infosdict['image'] = {}
    65             infosdict['object'] = {}
    65             infosdict['object'] = {}
    66             realurl = "https://www.eso.org" + j['url']
    66             realurl = "https://www.eso.org" + j['url']
       
    67 
       
    68             tempo_imgid = realurl.strip('/').split('/')[-1]
       
    69             tempo_imgdirectory = directory + tempo_imgid
       
    70             if os.path.isdir(tempo_imgdirectory) and os.path.isfile(os.path.join(tempo_imgdirectory, tempo_imgid+".json")):
       
    71                 print("Image alredy processed skipping %s" % realurl)
       
    72                 continue
       
    73 
    67             page = requests.get(realurl)
    74             page = requests.get(realurl)
    68             #print(realurl)
    75             #print(realurl)
    69             soup = BeautifulSoup(page.text, "html5lib")
    76             soup = BeautifulSoup(page.text, "html5lib")
    70             infosimg = soup.find("div", class_="object-info").find_all("table")[0]
    77             infosimg = soup.find("div", class_="object-info").find_all("table")[0]
    71             infosobj = soup.find("div", class_="object-info").find_all("table")[1]
    78             infosobj = soup.find("div", class_="object-info").find_all("table")[1]