From 5f8de423f190bbb79a62f804151bc24824fa32d8 Mon Sep 17 00:00:00 2001
From: "Matt A. Tobin"
Date: Fri, 2 Feb 2018 04:16:08 -0500
Subject: Add m-esr52 at 52.6.0

---
 python/futures/crawl.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 python/futures/crawl.py

(limited to 'python/futures/crawl.py')

diff --git a/python/futures/crawl.py b/python/futures/crawl.py
new file mode 100644
index 000000000..86e0af7fe
--- /dev/null
+++ b/python/futures/crawl.py
@@ -0,0 +1,74 @@
+"""Compare the speed of downloading URLs sequentially vs. using futures."""
+
+import functools
+import time
+import timeit
+import sys
+
+try:
+    from urllib2 import urlopen
+except ImportError:
+    from urllib.request import urlopen
+
+from concurrent.futures import (as_completed, ThreadPoolExecutor,
+                                ProcessPoolExecutor)
+
+URLS = ['http://www.google.com/',
+        'http://www.apple.com/',
+        'http://www.ibm.com',
+        'http://www.thisurlprobablydoesnotexist.com',
+        'http://www.slashdot.org/',
+        'http://www.python.org/',
+        'http://www.bing.com/',
+        'http://www.facebook.com/',
+        'http://www.yahoo.com/',
+        'http://www.youtube.com/',
+        'http://www.blogger.com/']
+
+def load_url(url, timeout):
+    kwargs = {'timeout': timeout} if sys.version_info >= (2, 6) else {}
+    return urlopen(url, **kwargs).read()
+
+def download_urls_sequential(urls, timeout=60):
+    url_to_content = {}
+    for url in urls:
+        try:
+            url_to_content[url] = load_url(url, timeout=timeout)
+        except:
+            pass
+    return url_to_content
+
+def download_urls_with_executor(urls, executor, timeout=60):
+    try:
+        url_to_content = {}
+        future_to_url = dict((executor.submit(load_url, url, timeout), url)
+                             for url in urls)
+
+        for future in as_completed(future_to_url):
+            try:
+                url_to_content[future_to_url[future]] = future.result()
+            except:
+                pass
+        return url_to_content
+    finally:
+        executor.shutdown()
+
+def main():
+    for name, fn in [('sequential',
+                      functools.partial(download_urls_sequential, URLS)),
+                     ('processes',
+                      functools.partial(download_urls_with_executor,
+                                        URLS,
+                                        ProcessPoolExecutor(10))),
+                     ('threads',
+                      functools.partial(download_urls_with_executor,
+                                        URLS,
+                                        ThreadPoolExecutor(10)))]:
+        sys.stdout.write('%s: ' % name.ljust(12))
+        start = time.time()
+        url_map = fn()
+        sys.stdout.write('%.2f seconds (%d of %d downloaded)\n' %
+                         (time.time() - start, len(url_map), len(URLS)))
+
+if __name__ == '__main__':
+    main()
--
cgit v1.2.3
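
The file added above targets both Python 2 and early Python 3 at once, which
is why it falls back from urllib2 to urllib.request, checks sys.version_info
before passing a timeout, and swallows failures with bare "except:" clauses.
On Python 3 alone, roughly the same threaded benchmark can be written with an
executor context manager and narrower exception handling. The sketch below is
a hypothetical modernization for comparison, not part of the commit; the names
load_url, download_urls, and the two-entry URLS list are illustrative stand-ins
for the originals.

# Hypothetical Python 3 sketch, not part of the commit above: the "with"
# block replaces the manual try/finally executor.shutdown(), and OSError
# (which urllib.error.URLError subclasses on Python 3) replaces "except:".
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.request import urlopen

URLS = ['http://www.python.org/', 'http://www.ibm.com/']  # sample subset

def load_url(url, timeout=60):
    # urlopen responses are context managers on Python 3.
    with urlopen(url, timeout=timeout) as conn:
        return conn.read()

def download_urls(urls, max_workers=10, timeout=60):
    url_to_content = {}
    # The executor is shut down when the "with" block exits, even on error.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(load_url, url, timeout): url
                         for url in urls}
        for future in as_completed(future_to_url):
            try:
                url_to_content[future_to_url[future]] = future.result()
            except OSError:
                pass  # skip unreachable hosts, as the original does
    return url_to_content

if __name__ == '__main__':
    start = time.time()
    content = download_urls(URLS)
    print('%.2f seconds (%d of %d downloaded)'
          % (time.time() - start, len(content), len(URLS)))

For I/O-bound work like this, ThreadPoolExecutor is usually the better fit;
the original script also times ProcessPoolExecutor, which mainly pays off for
CPU-bound tasks since each URL's payload must be pickled back from the worker
process.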