Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 41 additions & 2 deletions src/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ def check_pos_int(val: int):
else:
raise ValueError


def get_arguments(argv=sys.argv):
"""
The cli front end for the scraper.
Expand All @@ -40,7 +39,7 @@ def get_arguments(argv=sys.argv):
Returns:
parser.parse_args() -- A struct with all required info to run the scraper
"""
parser = argparse.ArgumentParser(description="Scrape google for images")
parser = argparse.ArgumentParser(description="Scrape Google for images")
parser.add_argument("keyword",
help="the phrase used to find images",
type=str,
Expand All @@ -59,6 +58,46 @@ def get_arguments(argv=sys.argv):
type=check_pos_int,
nargs="?",
default=1)
parser.add_argument("-s", "--size",
help="Restrict your search to a certain size of image.",
type=str,
nargs="?",
choices=['large','medium','icon', '400x300', '640x480', '800x600', '1024x768', '2mp', '4mp', '8mp', '10mp', '12mp', '15mp', '20mp', '40mp', '70mp'])
parser.add_argument("-a", "--aspectratio",
help="Restrict to specific aspect ratios.",
type=str,
nargs="?",
choices=['tall', 'square', 'wide', 'panoramic'])
parser.add_argument("-i", "--color",
help="Search for a certain color of image.",
type=str,
nargs="?",
choices=['color', 'grayscale', 'transparent', 'red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white', 'gray', 'black', 'brown'])
parser.add_argument("-k", "--type",
help="The type of image to search for.",
type=str,
nargs="?",
choices=['face', 'photo', 'clipart', 'lineart', 'animated'],
dest="imgtype")
parser.add_argument("-r", "--region",
help="Get results from a specific region.",
type=str,
nargs="?")
parser.add_argument("-f", "--filetype",
help="Search for a specific file extension.",
type=str,
nargs="?",
choices=['jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico', 'raw'])
parser.add_argument("-u", "--usage",
help="Specify usage rights.",
type=str,
nargs="?",
choices=['cc', 'other'])
parser.add_argument("-p", "--safesearch",
help="Force the use of a specific safesearch setting. Can be 'on' or 'off'.",
type=str,
nargs="?",
choices=['on', 'off'])
args = parser.parse_args(argv[1:])
# Set default directory
if args.directory is None:
Expand Down
2 changes: 1 addition & 1 deletion src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

def main():
    """
    Parse the command line and run the scraper with the requested filters.

    The filter values (size, aspect ratio, color, type, region, filetype,
    usage rights and safesearch) are forwarded as parsed; the type filter is
    read from args.imgtype.
    """
    args = get_arguments(sys.argv)
    # Only the call that passes every filter is kept: the older
    # four-argument call does not match the extended scrape_images signature.
    scrape_images(
        args.keyword[0],
        args.count,
        args.directory,
        args.threads,
        args.size,
        args.aspectratio,
        args.color,
        args.imgtype,
        args.region,
        args.filetype,
        args.usage,
        args.safesearch,
    )
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filters should be passed as a single object so that we can avoid these long, hard-to-read definitions. IMO, if you can't call the function on one line, it needs to be refactored into a more compact structure.


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
115 changes: 113 additions & 2 deletions src/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,116 @@ def add_filetype(file_path: str):
eprint(err)
return 1


def process_image_size(val: str):
    """
    Translate a size choice into its 'isz:' search token.

    Arguments:
        val -- 'large', 'medium', 'icon', a fixed resolution such as
               '640x480', or a megapixel count such as '8mp'

    Returns:
        The token for the requested size, or "" if val is not recognised
    """
    named_sizes = {'large': 'l', 'medium': 'm', 'icon': 'i'}
    resolution_codes = {
        '400x300': "qsvga",
        '640x480': "vga",
        '800x600': "svga",
        '1024x768': "xga",
    }
    megapixel_sizes = (
        '2mp', '4mp', '6mp', '8mp', '10mp',
        '12mp', '15mp', '20mp', '40mp', '70mp',
    )

    if val in named_sizes:
        return 'isz:' + named_sizes[val]

    # Resolutions and megapixel counts share the same 'lt' / 'islt' prefix.
    prefix = 'isz:' + 'lt%2Cislt:'
    if val in resolution_codes:
        return prefix + resolution_codes[val]
    if val in megapixel_sizes:
        return prefix + val
    return ""

def process_image_aspectratio(val: str):
    """
    Translate an aspect ratio choice into its 'iar:' search token.

    Arguments:
        val -- one of 'tall', 'square', 'wide' or 'panoramic'

    Returns:
        The token for the requested aspect ratio, or "" if val is not
        recognised
    """
    ratio_codes = {'tall': 't', 'square': 's', 'wide': 'w', 'panoramic': 'xw'}
    code = ratio_codes.get(val)
    if code is None:
        # Return an empty string instead of falling through to an implicit
        # None, which cannot be joined into the url with the other tokens.
        return ""
    return 'iar:' + code

def process_image_color(val: str):
    """
    Translate a color choice into its 'ic:' search token.

    Arguments:
        val -- 'color', 'grayscale', 'transparent', or the name of a
               specific color such as 'red'

    Returns:
        The token for the requested color, or "" if val is not recognised
    """
    color_modes = {
        "color": "ic:color",
        "grayscale": "ic:gray",
        "transparent": "ic:trans",
    }
    specific_colors = (
        'red', 'orange', 'yellow', 'green', 'teal', 'blue',
        'purple', 'pink', 'white', 'gray', 'black', 'brown',
    )

    if val in color_modes:
        return color_modes[val]
    if val in specific_colors:
        # A named color is sent as the 'specific' mode plus the color itself.
        return "ic:specific%2Cisc:" + val
    return ""

def process_image_type(val: str):
    """
    Translate an image type choice into its 'itp:' search token.

    Arguments:
        val -- one of 'face', 'photo', 'clipart', 'lineart' or 'animated'

    Returns:
        'itp:' followed by val, or "" if val is not recognised
    """
    known_types = ('face', 'photo', 'clipart', 'lineart', 'animated')
    if val not in known_types:
        return ""
    return 'itp:' + val

def process_image_region(val: str):
    """
    Translate a region code into its 'ctr:country' search token.

    Arguments:
        val -- the region code; it is upper-cased before being appended
               (presumably a two-letter country code -- not validated here)

    Returns:
        'ctr:country' followed by the upper-cased code, or '' if val is
        empty or None
    """
    # Treat None like the empty string so a missing region yields no token
    # instead of raising AttributeError on .upper().
    if not val:
        return ''
    return 'ctr:country' + val.upper()

def process_image_filetype(val: str):
    """
    Translate a file extension choice into its 'ift:' search token.

    Arguments:
        val -- one of 'jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico'
               or 'raw'

    Returns:
        'ift:' followed by val, or "" if val is not recognised
    """
    known_filetypes = ('jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico', 'raw')
    if val in known_filetypes:
        return 'ift:' + val
    # Return an empty string instead of falling through to an implicit
    # None, which cannot be joined into the url with the other tokens.
    return ""

def process_image_usage(val: str):
    """
    Translate a usage rights choice into its 'sur:' search token.

    Arguments:
        val -- 'cc' or 'other'

    Returns:
        'sur:cl' for 'cc', 'sur:ol' for 'other', or '' if val is not
        recognised
    """
    rights_codes = {'cc': 'cl', 'other': 'ol'}
    if val not in rights_codes:
        return ''
    return 'sur:' + rights_codes[val]

def process_safesearch(val: str):
    """
    Validate a safesearch setting.

    Arguments:
        val -- 'on' or 'off'

    Returns:
        val unchanged when it is 'on' or 'off', otherwise ""
    """
    # NOTE(review): open question from code review -- is the default
    # safesearch behavior any different from "off"?
    allowed_settings = ("on", "off")
    if val not in allowed_settings:
        return ""
    return val


def setup_url(searchurl: str, imgsize: str, imgaspectratio: str, imgcolor: str, imgtype: str, imgregion: str, imgfiletype: str, imgusage: str, safesearch: str):
    """
    Append the requested image filters to a search url.

    Every filter that is not None is translated by its process_* helper.
    The image filters are combined into one 'tbs=' parameter (joined with
    '%2C'); safesearch goes into a separate 'safe=' parameter. Parameters
    are attached to searchurl with '&'.

    Arguments:
        searchurl -- the url to extend (presumably it already carries a
                     query string, since parameters are appended with '&')
        imgsize, imgaspectratio, imgcolor, imgtype, imgregion, imgfiletype,
        imgusage -- image filter choices, or None to leave a filter out
        safesearch -- 'on' or 'off', or None to leave it out

    Returns:
        searchurl followed by the filter parameters; searchurl unchanged if
        no filter produced a token
    """
    import logging

    tbs_tokens = []
    if imgsize is not None:
        tbs_tokens.append(process_image_size(imgsize))
    if imgaspectratio is not None:
        tbs_tokens.append(process_image_aspectratio(imgaspectratio))
    if imgcolor is not None:
        tbs_tokens.append(process_image_color(imgcolor))
    if imgtype is not None:
        tbs_tokens.append(process_image_type(imgtype))
    if imgregion is not None:
        tbs_tokens.append(process_image_region(imgregion))
    if imgfiletype is not None:
        tbs_tokens.append(process_image_filetype(imgfiletype))
    if imgusage is not None:
        tbs_tokens.append(process_image_usage(imgusage))

    safe_tokens = []
    if safesearch is not None:
        safe_tokens.append(process_safesearch(safesearch))

    # A helper yields an empty value when it does not recognise its input;
    # dropping those avoids an empty 'tbs='/'safe=' parameter or a stray
    # '%2C' separator in the url.
    tbs_tokens = [token for token in tbs_tokens if token]
    safe_tokens = [token for token in safe_tokens if token]

    features = [searchurl]
    if tbs_tokens:
        features.append("tbs=" + "%2C".join(tbs_tokens))
    if safe_tokens:
        features.append("safe=" + "%2C".join(safe_tokens))
    url = "&".join(features)

    # Debug-level log with identifying text instead of an unconditional
    # print, so normal runs do not write the url to stdout.
    logging.getLogger(__name__).debug("search url with filters: %s", url)
    return url


############################# scraping helpers ################################

def get_image_urls(query: str, page: int):
Expand Down Expand Up @@ -159,7 +269,7 @@ def get_manifest(search_key: str, image_cnt: int):

################################# main api ####################################

def scrape_images(search_key, image_cnt, directory, threads):
def scrape_images(search_key, image_cnt, directory, threads, size, aspectratio, color, imgtype, region, filetype, usage, safesearch):
"""
Request manifest, generate paths, save files, get filetype.
This is the only function that should be called externally.
Expand All @@ -174,7 +284,8 @@ def scrape_images(search_key, image_cnt, directory, threads):
print("savedir: {}".format(directory))
if not os.path.exists(directory):
os.makedirs(directory)

global search_url
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The global should be declared in setup_url so the logic is compartmentalized.

search_url = setup_url(search_url, size, aspectratio, color, imgtype, region, filetype, usage, safesearch)
id_url_manifest = get_manifest(search_key, image_cnt)
with ThreadPoolExecutor(max_workers=threads) as pool:
with tqdm(total=len(id_url_manifest)) as progress:
Expand Down