# HG changeset patch
# User ymh <ymh.work@gmail.com>
# Date 1530204620 -7200
# Node ID a40fd399085090673762dec720904a433214be83
# Parent  7e15a917d94031ddec33048ef5c1d292335c1d4c
skip already processed images

diff -r 7e15a917d940 -r a40fd3990850 data/script/scrapeso.py
--- a/data/script/scrapeso.py	Thu Jun 28 17:26:15 2018 +0200
+++ b/data/script/scrapeso.py	Thu Jun 28 18:50:20 2018 +0200
@@ -64,6 +64,13 @@
             infosdict['image'] = {}
             infosdict['object'] = {}
             realurl = "https://www.eso.org" + j['url']
+
+            tempo_imgid = realurl.strip('/').split('/')[-1]  # image id = last path segment of the page URL
+            tempo_imgdirectory = directory + tempo_imgid  # NOTE(review): assumes `directory` ends with a path separator - confirm
+            if os.path.isdir(tempo_imgdirectory) and os.path.isfile(os.path.join(tempo_imgdirectory, tempo_imgid+".json")):
+                print("Image already processed skipping %s" % realurl)
+                continue
+
             page = requests.get(realurl)
             #print(realurl)
             soup = BeautifulSoup(page.text, "html5lib")