diff options
author | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
---|---|---|
committer | Matt A. Tobin <mattatobin@localhost.localdomain> | 2018-02-02 04:16:08 -0500 |
commit | 5f8de423f190bbb79a62f804151bc24824fa32d8 (patch) | |
tree | 10027f336435511475e392454359edea8e25895d /python/futures/crawl.py | |
parent | 49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff) | |
download | UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip |
Add m-esr52 at 52.6.0
Diffstat (limited to 'python/futures/crawl.py')
-rw-r--r-- | python/futures/crawl.py | 74 |
1 files changed, 74 insertions, 0 deletions
# Origin: python/futures/crawl.py (new file, mode 100644, blob 86e0af7fe).
"""Compare the speed of downloading URLs sequentially vs. using futures."""

import contextlib
import functools
import time
import timeit
import sys

try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen

from concurrent.futures import (as_completed, ThreadPoolExecutor,
                                ProcessPoolExecutor)

# Sample targets for the benchmark; one of them is expected to fail so the
# "N of M downloaded" figure exercises the error path as well.
URLS = ['http://www.google.com/',
        'http://www.apple.com/',
        'http://www.ibm.com',
        'http://www.thisurlprobablydoesnotexist.com',
        'http://www.slashdot.org/',
        'http://www.python.org/',
        'http://www.bing.com/',
        'http://www.facebook.com/',
        'http://www.yahoo.com/',
        'http://www.youtube.com/',
        'http://www.blogger.com/']


def load_url(url, timeout):
    """Fetch *url* and return the full response body.

    Args:
        url: The URL (or request object) handed to ``urlopen``.
        timeout: Timeout forwarded to ``urlopen`` on Python >= 2.6; older
            interpreters do not accept the keyword, so it is dropped there.

    Returns:
        Whatever ``read()`` on the opened response returns.

    Raises:
        Any exception raised by ``urlopen`` or ``read`` propagates unchanged.
    """
    kwargs = {'timeout': timeout} if sys.version_info >= (2, 6) else {}
    # closing() releases the connection even when read() fails, and works on
    # Python 2 where the response object is not a context manager.
    with contextlib.closing(urlopen(url, **kwargs)) as response:
        return response.read()


def download_urls_sequential(urls, timeout=60):
    """Download every URL in *urls* one after another.

    Args:
        urls: Iterable of URLs to fetch.
        timeout: Per-request timeout passed to ``load_url``.

    Returns:
        A dict mapping each successfully fetched URL to its content. URLs
        whose download raised an ``Exception`` are skipped (best effort).
    """
    url_to_content = {}
    for url in urls:
        try:
            url_to_content[url] = load_url(url, timeout=timeout)
        except Exception:
            # Best effort: a failed download is simply left out of the
            # result. KeyboardInterrupt/SystemExit are deliberately NOT
            # caught so the benchmark can still be aborted.
            pass
    return url_to_content


def download_urls_with_executor(urls, executor, timeout=60):
    """Download every URL in *urls* concurrently using *executor*.

    Args:
        urls: Iterable of URLs to fetch.
        executor: A ``concurrent.futures`` executor. It is shut down before
            this function returns, whether or not the downloads succeeded.
        timeout: Per-request timeout passed to ``load_url``.

    Returns:
        A dict mapping each successfully fetched URL to its content. URLs
        whose download raised an ``Exception`` are skipped (best effort).
    """
    try:
        url_to_content = {}
        future_to_url = dict((executor.submit(load_url, url, timeout), url)
                             for url in urls)

        for future in as_completed(future_to_url):
            try:
                url_to_content[future_to_url[future]] = future.result()
            except Exception:
                # Same best-effort policy as the sequential variant.
                pass
        return url_to_content
    finally:
        executor.shutdown()


def main():
    """Time each download strategy over ``URLS`` and print a summary line."""
    strategies = [('sequential',
                   functools.partial(download_urls_sequential, URLS)),
                  ('processes',
                   functools.partial(download_urls_with_executor,
                                     URLS,
                                     ProcessPoolExecutor(10))),
                  ('threads',
                   functools.partial(download_urls_with_executor,
                                     URLS,
                                     ThreadPoolExecutor(10)))]
    for name, fn in strategies:
        sys.stdout.write('%s: ' % name.ljust(12))
        # Flush so the label is visible while the (slow) download runs.
        sys.stdout.flush()
        # default_timer is the platform's best clock for measuring intervals.
        start = timeit.default_timer()
        url_map = fn()
        elapsed = timeit.default_timer() - start
        sys.stdout.write('%.2f seconds (%d of %d downloaded)\n' %
                         (elapsed, len(url_map), len(URLS)))


if __name__ == '__main__':
    main()