Chia sẻ với anh em mã nguồn Python download ảnh sử dụng đa luồng (multi-threading). Tốc độ download ảnh cực ngon.
Mã nguồn này được cải tiến từ mã nguồn của một anh Tây ở đây: https://gist.github.com/chandlerprall/1017266.
Nội dung cải tiến: nâng cấp để dùng cho Python 3, cho phép thiết lập tên và đường dẫn của file tải về.
Table of Contents
Mã nguồn Python download ảnh
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
class ThreadedDownload(object):
    """Download a batch of files concurrently using worker threads.

    The class uses ``os``, ``re``, ``threading``, ``Queue`` and ``urlopen``
    from the enclosing module.  On Python 3 these would normally come from::

        import os
        import re
        import threading
        from queue import Queue
        from urllib.request import urlopen

    How to use::

        # urls is a list of [image_url, download_file_path] pairs
        downloader = ThreadedDownload(urls, ".", True, 10, 2)
        print('Downloading %s files' % len(urls))
        downloader.run()

        # print report
        print('Downloaded %(success)s of %(total)s' % {
            'success': len(downloader.report['success']),
            'total': len(urls),
        })
        for target in downloader.report['failure']:
            print(target)
    """

    # Pre-compiled patterns shared by all instances.
    REGEX = {
        'hostname_strip': re.compile(r'.*\..*?/', re.I)
    }

    class MissingDirectoryException(Exception):
        """Raised when the base destination folder does not exist."""
        pass

    class Downloader(threading.Thread):
        """Worker thread that pulls URLTarget objects off the shared queue."""

        def __init__(self, queue, report):
            threading.Thread.__init__(self)
            self.queue = queue
            self.report = report

        def run(self):
            """Process targets until the queue is empty, retrying failures."""
            # Local import so the block does not depend on a new file-level
            # import; only the Queue class itself is assumed to be in scope.
            from queue import Empty

            while True:
                # A non-blocking get avoids the empty()/get() race in which
                # another worker takes the last item and this thread would
                # block forever.
                try:
                    target = self.queue.get_nowait()
                except Empty:
                    return
                try:
                    if target.download():
                        self.report['success'].append(target)
                    elif target.url_tried < target.url_tries:
                        # Re-queue BEFORE task_done() so the queue's
                        # unfinished-task count never drops to zero early.
                        self.queue.put(target)
                    else:
                        # ">=" semantics: a target whose try budget is
                        # exhausted (including url_tries == 0) is always
                        # reported instead of being silently dropped.
                        self.report['failure'].append(target)
                finally:
                    self.queue.task_done()

    class URLTarget(object):
        """One download job: a source URL and the file path to save it to."""

        def __init__(self, url, destination, url_tries):
            self.url = url
            self.destination = destination
            self.url_tries = url_tries  # maximum number of attempts
            self.url_tried = 0          # attempts made so far
            self.success = False
            self.error = None           # last exception raised, if any

        def download(self):
            """Attempt the download once.

            Returns:
                True when the destination file already exists or was written
                successfully, False otherwise (the exception is stored in
                ``self.error``).
            """
            self.url_tried += 1
            try:
                if os.path.exists(self.destination):
                    # This file has already been downloaded
                    self.success = True
                    return self.success

                with urlopen(self.url) as remote_file:
                    package = remote_file.read()

                folder = os.path.dirname(self.destination)
                # A bare filename has an empty dirname, which makedirs
                # rejects; exist_ok tolerates another worker creating the
                # same folder concurrently.
                if folder:
                    os.makedirs(folder, exist_ok=True)

                with open(self.destination, 'wb') as dest_file:
                    dest_file.write(package)

                self.success = True
            except Exception as e:
                # Best-effort by design: the worker decides whether to retry.
                self.error = e
            return self.success

        def __str__(self):
            return 'URLTarget (%(url)s, %(success)s, %(error)s)' % {
                'url': self.url,
                'success': self.success,
                'error': self.error,
            }

    def __init__(self, urls=None, destination='.', directory_structure=False,
                 thread_count=5, url_tries=3):
        """Set up the job queue.

        Args:
            urls: iterable of ``[url, file_path]`` pairs to download; may be
                omitted and filled later through ``addTarget``.
            destination: existing base folder used by ``fileDestination``.
            directory_structure: False, True, or a ``(regex, replacement)``
                pair; see ``fileDestination``.
            thread_count: number of worker threads started by ``run``.
            url_tries: maximum download attempts per target.

        Raises:
            ThreadedDownload.MissingDirectoryException: if ``destination``
                does not exist.
        """
        if not os.path.exists(destination):
            raise ThreadedDownload.MissingDirectoryException('Destination folder does not exist.')

        self.queue = Queue(maxsize=0)  # Infinite sized queue
        self.report = {'success': [], 'failure': []}
        self.threads = []

        if not destination.endswith(os.path.sep):
            destination = destination + os.path.sep
        self.destination = destination
        self.thread_count = thread_count
        self.directory_structure = directory_structure

        # Prepopulate queue with any values we were given
        for url in urls or ():
            self.queue.put(ThreadedDownload.URLTarget(url[0], url[1], url_tries))

    def fileDestination(self, url):
        """Map a URL to a local file path under ``self.destination``.

        Returns None when ``directory_structure`` is not one of the
        supported forms.
        """
        structure = self.directory_structure
        # Equality (not identity) is kept on purpose so that 0 / 1 continue
        # to behave like False / True.
        if structure == False:  # noqa: E712
            # No directory structure, just filenames
            relative = os.path.basename(url)
        elif structure == True:  # noqa: E712
            # Strip off hostname, keep all other directories
            relative = ThreadedDownload.REGEX['hostname_strip'].sub('', url)
        elif hasattr(structure, '__len__') and len(structure) == 2:
            # User supplied a custom regex replace
            regex, replace = structure[0], structure[1]
            if isinstance(regex, str):
                # Compile the user's pattern (the builtin ``str`` was being
                # compiled here before, which always raised TypeError).
                regex = re.compile(regex)
            relative = regex.sub(replace, url)
        else:
            # No idea what's wanted
            return None

        file_destination = '%s%s' % (self.destination, relative)
        return file_destination.replace('/', os.path.sep)

    def addTarget(self, url, url_tries=3):
        """Queue ``url``; its save path is derived by ``fileDestination``."""
        self.queue.put(ThreadedDownload.URLTarget(url, self.fileDestination(url), url_tries))

    def run(self):
        """Start the worker threads and block until every target is done."""
        workers = []
        for _ in range(self.thread_count):
            thread = ThreadedDownload.Downloader(self.queue, self.report)
            thread.start()
            self.threads.append(thread)
            workers.append(thread)

        # Always join: workers may already have dequeued every item (so
        # qsize() is 0) while downloads are still in flight.  join() returns
        # immediately when nothing is pending.
        self.queue.join()
        for thread in workers:
            thread.join()
Cách sử dụng mã nguồn download ảnh đa luồng
1 2 3 4 5 6 7 8 9 10 11 12 13 |
def do_download(urls):
    """Download every entry of *urls* and print a short report.

    *urls* is passed straight to ``ThreadedDownload`` together with "." as
    the destination folder, directory structure enabled, 10 worker threads
    and 2 tries per URL.  After the run, the number of successful downloads
    is printed, followed by each failed target (if any).
    """
    pool = ThreadedDownload(urls, ".", True, 10, 2)
    print(f'Downloading {len(urls)} files')
    pool.run()

    # Summary line: successes out of the total requested.
    succeeded = len(pool.report['success'])
    print(f"Downloaded {succeeded} success of {len(urls)} total")

    failed = pool.report['failure']
    if not failed:
        return

    print('\nFailed urls:')
    for target in failed:
        print(target)
- urls: là list các cặp [đường dẫn ảnh, đường dẫn lưu ảnh], ví dụ: DownLoadList.append([image_url, image_file_path])
Tham khảo bài viết khác
Mời anh em dùng thử. Hãy comment chia sẻ cùng bạn bè nếu thấy hữu ích nhé !
Mình là một lập trình viên tự do với hơn 10 năm kinh nghiệm. Mình chuyên về Web scraping, Web automation, Python, Django
Free Sex Dating https://bit.ly/3hhve2N