Chia sẻ mã nguồn Python download ảnh

Chia sẻ với anh em mã nguồn Python download ảnh sử dụng đa luồng (multi-thread), giúp tốc độ download ảnh cực ngon.

Mã nguồn này được cải tiến từ mã nguồn của một anh Tây ở đây: https://gist.github.com/chandlerprall/1017266.

Nội dung cải tiến: nâng cấp để dùng cho Python 3, cho phép thiết lập tên và đường dẫn file tải về.

Table of Contents

Mã nguồn Python download ảnh

class ThreadedDownload(object):
    """Download a batch of files concurrently with a pool of worker threads.

    How to use::

        # urls is a list of [image_url, download_file_path] pairs
        downloader = ThreadedDownload(urls, ".", True, 10, 2)
        print('Downloading %s files' % len(urls))
        downloader.run()
        # print report
        print('Downloaded %s of %s' % (len(downloader.report['success']), len(urls)))

        if downloader.report['failure']:
            print('Failed urls:')
            for url in downloader.report['failure']:
                print(url)
    """

    REGEX = {
        # Raw string so that "\." is not treated as a (deprecated) string escape.
        'hostname_strip': re.compile(r'.*\..*?/', re.I)
    }

    class MissingDirectoryException(Exception):
        """Raised by ThreadedDownload when the destination folder does not exist."""
        pass

    class Downloader(threading.Thread):
        """Worker thread that drains the shared queue of URLTarget objects."""

        def __init__(self, queue, report):
            """Store the shared work queue and the shared report dict."""
            threading.Thread.__init__(self)
            self.queue = queue
            self.report = report

        def run(self):
            """Process targets until the queue is empty, retrying failed ones.

            A failed target is put back on the queue while it still has tries
            left; otherwise it is appended to ``report['failure']``.
            Successful targets are appended to ``report['success']``.
            """
            # Local import: only the Queue class itself is known to be in
            # scope at module level.
            from queue import Empty

            while True:
                # A non-blocking get avoids hanging forever when another
                # worker takes the last item between an empty() check and
                # a blocking get().
                try:
                    url = self.queue.get_nowait()
                except Empty:
                    return

                try:
                    if url.download():
                        self.report['success'].append(url)
                    elif url.url_tried < url.url_tries:
                        # Re-queue before task_done() so the item stays
                        # accounted for by the queue.
                        self.queue.put(url)
                    else:
                        # Out of tries (also covers url_tries < 1, which
                        # previously dropped the target without reporting).
                        self.report['failure'].append(url)
                finally:
                    self.queue.task_done()

    class URLTarget(object):
        """One download job: a source URL, a destination path and its retry state."""

        def __init__(self, url, destination, url_tries):
            """
            url         -- address passed to urlopen
            destination -- path of the file to write
            url_tries   -- maximum number of download attempts
            """
            self.url = url
            self.destination = destination
            self.url_tries = url_tries
            self.url_tried = 0
            self.success = False
            self.error = None

        def download(self):
            """Attempt the download once and return True on success.

            An already existing destination file counts as success and is not
            fetched again. Any exception is stored in ``self.error`` instead
            of being raised, so the caller can decide whether to retry.
            """
            self.url_tried += 1

            try:
                if os.path.exists(self.destination):  # This file has already been downloaded
                    self.success = True
                    return self.success

                remote_file = urlopen(self.url)
                try:
                    package = remote_file.read()
                finally:
                    # Close the response even when read() fails.
                    remote_file.close()

                directory = os.path.dirname(self.destination)
                if directory:
                    # dirname is '' for a bare filename (makedirs('') raises);
                    # exist_ok tolerates another worker creating the same
                    # directory at the same time.
                    os.makedirs(directory, exist_ok=True)

                with open(self.destination, 'wb') as dest_file:
                    dest_file.write(package)

                self.success = True

            except Exception as e:
                # Deliberately broad: a failure is recorded, never raised.
                self.error = e

            return self.success

        def __str__(self):
            return 'URLTarget (%(url)s, %(success)s, %(error)s)' % {'url':self.url, 'success':self.success, 'error':self.error}

    def __init__(self, urls=None, destination='.', directory_structure=False, thread_count=5, url_tries=3):
        """Create the downloader and queue the given targets.

        urls                -- iterable of [url, destination_path] pairs
                               (None means no initial targets)
        destination         -- existing base folder, used by fileDestination
        directory_structure -- False, True or a (regex, replacement) pair;
                               see fileDestination
        thread_count        -- number of worker threads started by run()
        url_tries           -- maximum attempts for each initial target

        Raises ThreadedDownload.MissingDirectoryException when ``destination``
        does not exist.
        """
        if not os.path.exists(destination):
            raise ThreadedDownload.MissingDirectoryException('Destination folder does not exist.')

        self.queue = Queue(maxsize=0) # Infinite sized queue
        self.report = {'success':[],'failure':[]}
        self.threads = []

        if not destination.endswith(os.path.sep):
            destination = destination + os.path.sep
        self.destination = destination
        self.thread_count = thread_count
        self.directory_structure = directory_structure

        # Prepopulate queue with any values we were given
        for url in urls or ():
            self.queue.put(ThreadedDownload.URLTarget(url[0], url[1], url_tries))

    def fileDestination(self, url):
        """Return the local path for ``url`` according to directory_structure.

        False           -- destination folder + base name of the url
        True            -- destination folder + url with the part matched by
                           REGEX['hostname_strip'] removed
        (regex, repl)   -- destination folder + regex.sub(repl, url); regex
                           may be a compiled pattern or a pattern string
        anything else   -- None
        """
        structure = self.directory_structure

        # Equality (not identity) is kept so that 0 / 1 keep working as flags.
        if structure == False:  # noqa: E712
            # No directory structure, just filenames
            file_destination = '%s%s' % (self.destination, os.path.basename(url))

        elif structure == True:  # noqa: E712
            # Strip off hostname, keep all other directories
            file_destination = '%s%s' % (self.destination, ThreadedDownload.REGEX['hostname_strip'].sub('', url))

        elif hasattr(structure, '__len__') and len(structure) == 2:
            # User supplied a custom regex replace
            regex, replace = structure[0], structure[1]
            if isinstance(regex, str):
                # Compile the supplied pattern (not the builtin ``str`` type).
                regex = re.compile(regex)
            file_destination = '%s%s' % (self.destination, regex.sub(replace, url))

        else:
            # No idea what's wanted
            file_destination = None

        if hasattr(file_destination, 'replace'):
            file_destination = file_destination.replace('/', os.path.sep)
        return file_destination

    def addTarget(self, url, url_tries=3):
        """Queue ``url`` with a destination computed by fileDestination."""
        self.queue.put(ThreadedDownload.URLTarget(url, self.fileDestination(url), url_tries))

    def run(self):
        """Start thread_count workers and block until all of them have finished."""
        workers = []
        for _ in range(self.thread_count):
            thread = ThreadedDownload.Downloader(self.queue, self.report)
            thread.start()
            workers.append(thread)
        self.threads.extend(workers)

        # Always wait for the workers: checking qsize() first could return
        # while items already taken off the queue were still downloading.
        for thread in workers:
            thread.join()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

class ThreadedDownload(object):
    """Download a batch of files concurrently with a pool of worker threads.

    How to use::

        # urls is a list of [image_url, download_file_path] pairs
        downloader = ThreadedDownload(urls, ".", True, 10, 2)
        print('Downloading %s files' % len(urls))
        downloader.run()
        # print report
        print('Downloaded %s of %s' % (len(downloader.report['success']), len(urls)))

        if downloader.report['failure']:
            print('Failed urls:')
            for url in downloader.report['failure']:
                print(url)
    """

    REGEX = {
        # Raw string so that "\." is not treated as a (deprecated) string escape.
        'hostname_strip': re.compile(r'.*\..*?/', re.I)
    }

    class MissingDirectoryException(Exception):
        """Raised by ThreadedDownload when the destination folder does not exist."""
        pass

    class Downloader(threading.Thread):
        """Worker thread that drains the shared queue of URLTarget objects."""

        def __init__(self, queue, report):
            """Store the shared work queue and the shared report dict."""
            threading.Thread.__init__(self)
            self.queue = queue
            self.report = report

        def run(self):
            """Process targets until the queue is empty, retrying failed ones.

            A failed target is put back on the queue while it still has tries
            left; otherwise it is appended to ``report['failure']``.
            Successful targets are appended to ``report['success']``.
            """
            # Local import: only the Queue class itself is known to be in
            # scope at module level.
            from queue import Empty

            while True:
                # A non-blocking get avoids hanging forever when another
                # worker takes the last item between an empty() check and
                # a blocking get().
                try:
                    url = self.queue.get_nowait()
                except Empty:
                    return

                try:
                    if url.download():
                        self.report['success'].append(url)
                    elif url.url_tried < url.url_tries:
                        # Re-queue before task_done() so the item stays
                        # accounted for by the queue.
                        self.queue.put(url)
                    else:
                        # Out of tries (also covers url_tries < 1, which
                        # previously dropped the target without reporting).
                        self.report['failure'].append(url)
                finally:
                    self.queue.task_done()

    class URLTarget(object):
        """One download job: a source URL, a destination path and its retry state."""

        def __init__(self, url, destination, url_tries):
            """
            url         -- address passed to urlopen
            destination -- path of the file to write
            url_tries   -- maximum number of download attempts
            """
            self.url = url
            self.destination = destination
            self.url_tries = url_tries
            self.url_tried = 0
            self.success = False
            self.error = None

        def download(self):
            """Attempt the download once and return True on success.

            An already existing destination file counts as success and is not
            fetched again. Any exception is stored in ``self.error`` instead
            of being raised, so the caller can decide whether to retry.
            """
            self.url_tried += 1

            try:
                if os.path.exists(self.destination):  # This file has already been downloaded
                    self.success = True
                    return self.success

                remote_file = urlopen(self.url)
                try:
                    package = remote_file.read()
                finally:
                    # Close the response even when read() fails.
                    remote_file.close()

                directory = os.path.dirname(self.destination)
                if directory:
                    # dirname is '' for a bare filename (makedirs('') raises);
                    # exist_ok tolerates another worker creating the same
                    # directory at the same time.
                    os.makedirs(directory, exist_ok=True)

                with open(self.destination, 'wb') as dest_file:
                    dest_file.write(package)

                self.success = True

            except Exception as e:
                # Deliberately broad: a failure is recorded, never raised.
                self.error = e

            return self.success

        def __str__(self):
            return 'URLTarget (%(url)s, %(success)s, %(error)s)' % {'url':self.url, 'success':self.success, 'error':self.error}

    def __init__(self, urls=None, destination='.', directory_structure=False, thread_count=5, url_tries=3):
        """Create the downloader and queue the given targets.

        urls                -- iterable of [url, destination_path] pairs
                               (None means no initial targets)
        destination         -- existing base folder, used by fileDestination
        directory_structure -- False, True or a (regex, replacement) pair;
                               see fileDestination
        thread_count        -- number of worker threads started by run()
        url_tries           -- maximum attempts for each initial target

        Raises ThreadedDownload.MissingDirectoryException when ``destination``
        does not exist.
        """
        if not os.path.exists(destination):
            raise ThreadedDownload.MissingDirectoryException('Destination folder does not exist.')

        self.queue = Queue(maxsize=0) # Infinite sized queue
        self.report = {'success':[],'failure':[]}
        self.threads = []

        if not destination.endswith(os.path.sep):
            destination = destination + os.path.sep
        self.destination = destination
        self.thread_count = thread_count
        self.directory_structure = directory_structure

        # Prepopulate queue with any values we were given
        for url in urls or ():
            self.queue.put(ThreadedDownload.URLTarget(url[0], url[1], url_tries))

    def fileDestination(self, url):
        """Return the local path for ``url`` according to directory_structure.

        False           -- destination folder + base name of the url
        True            -- destination folder + url with the part matched by
                           REGEX['hostname_strip'] removed
        (regex, repl)   -- destination folder + regex.sub(repl, url); regex
                           may be a compiled pattern or a pattern string
        anything else   -- None
        """
        structure = self.directory_structure

        # Equality (not identity) is kept so that 0 / 1 keep working as flags.
        if structure == False:  # noqa: E712
            # No directory structure, just filenames
            file_destination = '%s%s' % (self.destination, os.path.basename(url))

        elif structure == True:  # noqa: E712
            # Strip off hostname, keep all other directories
            file_destination = '%s%s' % (self.destination, ThreadedDownload.REGEX['hostname_strip'].sub('', url))

        elif hasattr(structure, '__len__') and len(structure) == 2:
            # User supplied a custom regex replace
            regex, replace = structure[0], structure[1]
            if isinstance(regex, str):
                # Compile the supplied pattern (not the builtin ``str`` type).
                regex = re.compile(regex)
            file_destination = '%s%s' % (self.destination, regex.sub(replace, url))

        else:
            # No idea what's wanted
            file_destination = None

        if hasattr(file_destination, 'replace'):
            file_destination = file_destination.replace('/', os.path.sep)
        return file_destination

    def addTarget(self, url, url_tries=3):
        """Queue ``url`` with a destination computed by fileDestination."""
        self.queue.put(ThreadedDownload.URLTarget(url, self.fileDestination(url), url_tries))

    def run(self):
        """Start thread_count workers and block until all of them have finished."""
        workers = []
        for _ in range(self.thread_count):
            thread = ThreadedDownload.Downloader(self.queue, self.report)
            thread.start()
            workers.append(thread)
        self.threads.extend(workers)

        # Always wait for the workers: checking qsize() first could return
        # while items already taken off the queue were still downloading.
        for thread in workers:
            thread.join()

Cách sử dụng mã nguồn download ảnh đa tiến trình

def do_download(urls):
    """Download every entry of ``urls`` and print a short report.

    ``urls`` is passed unchanged to ``ThreadedDownload`` together with the
    destination ".", directory_structure=True, 10 threads and 2 tries per
    URL. The number of successes is printed, followed by each failed target
    when there are any.
    """
    downloader = ThreadedDownload(urls, ".", True, 10, 2)
    total = len(urls)
    print('Downloading {} files'.format(total))

    downloader.run()

    # Summarise the outcome collected by the worker threads.
    report = downloader.report
    print("Downloaded {} success of {} total".format(len(report['success']), total))

    failed = report['failure']
    if failed:
        print('\nFailed urls:')
        for target in failed:
            print(target)

def do_download(urls):
    """Download every entry of ``urls`` and print a short report.

    ``urls`` is passed unchanged to ``ThreadedDownload`` together with the
    destination ".", directory_structure=True, 10 threads and 2 tries per
    URL. The number of successes is printed, followed by each failed target
    when there are any.
    """
    # Do download images
    # add list of download needed
    downloader = ThreadedDownload(urls, ".", True, 10, 2)
    print('Downloading {} files'.format(len(urls)))
    downloader.run()
    # print report
    print("Downloaded {} success of {} total".format(len(downloader.report['success']), len(urls)))

    if len(downloader.report['failure']) > 0:
        print('\nFailed urls:')
        for url in downloader.report['failure']:
            print(url)

Trong đó:

urls: là list các cặp [đường dẫn ảnh, đường dẫn lưu ảnh], ví dụ: DownLoadList.append([image_url, image_file_path])

Tham khảo bài viết khác

Mời anh em dùng thử. Hãy comment chia sẻ cùng bạn bè nếu thấy hữu ích nhé !

etuannv

Mình là một lập trình viên tự do với hơn 10 năm kinh nghiệm. Mình chuyên về Web scraping, Web automation, Python, Django

Chia sẻ mã nguồn Python download ảnh

Mã nguồn Python download ảnh

Cách sử dụng mã nguồn download ảnh đa tiến trình

Tham khảo bài viết khác

Bài viết khác

One Reply to “Chia sẻ mã nguồn Python download ảnh”

Trả lời Hủy