# HG changeset patch # User ymh # Date 1530204620 -7200 # Node ID a40fd399085090673762dec720904a433214be83 # Parent 7e15a917d94031ddec33048ef5c1d292335c1d4c skip already processed images diff -r 7e15a917d940 -r a40fd3990850 data/script/scrapeso.py --- a/data/script/scrapeso.py Thu Jun 28 17:26:15 2018 +0200 +++ b/data/script/scrapeso.py Thu Jun 28 18:50:20 2018 +0200 @@ -64,6 +64,13 @@ infosdict['image'] = {} infosdict['object'] = {} realurl = "https://www.eso.org" + j['url'] + + tempo_imgid = realurl.strip('/').split('/')[-1] + tempo_imgdirectory = directory + tempo_imgid + if os.path.isdir(tempo_imgdirectory) and os.path.isfile(os.path.join(tempo_imgdirectory, tempo_imgid+".json")): + print("Image alredy processed skipping %s" % realurl) + continue + page = requests.get(realurl) #print(realurl) soup = BeautifulSoup(page.text, "html5lib")