爬虫批量下载图片并清理
文章目录
说明
本文只提供下载图片和清理图片的函数代码,没有添加调用示例。使用前需要自行导入 os、re、time、requests、tqdm、imagehash 和 PIL.Image,并定义请求头 headers。
Python爬虫下载图片
# Collect image URLs from Baidu image search and download them page by page.
def getParsePage(pn, name, save_path):
    """Scan Baidu image-search result pages and download every image found.

    Parameters:
        pn: number of result pages to scan (converted with ``int``).
        name: search keyword.
        save_path: directory the downloaded images are written to.

    Raises:
        requests.RequestException: if a result page cannot be fetched.

    NOTE(review): the original docstring stated 60 images per page, while
    the request offset advances by 20 per page - confirm which one matches
    the live endpoint.
    """
    search_url = 'https://image.baidu.com/search/flip'
    for i in range(int(pn)):
        print('Scaning for {} page···'.format(i + 1), end=' ')
        # Let requests build the query string so that keywords containing
        # reserved characters (&, #, spaces, ...) are percent-encoded.
        query = {
            'tn': 'baiduimage',
            'ie': 'utf-8',
            'word': name,
            'pn': i * 20,
        }
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(
            search_url, params=query, headers=headers, timeout=10)
        # Only the ASCII "objURL" fields matter, so one undecodable byte
        # elsewhere in the page must not abort the scan.
        html = response.content.decode('utf-8', errors='replace')
        url_list = re.findall(r'"objURL":"(.*?)",', html)
        print('Done')
        # Save every image referenced on this page to disk.
        downloadImg(url_list=url_list, save_path=save_path, page=i + 1)


def downloadImg(url_list, save_path, page):
    """Download each URL in ``url_list`` into ``save_path``.

    Files are named ``<page>-<index>.jpg`` where ``index`` is the 1-based
    position of the URL in ``url_list``; a failed download leaves a gap in
    the numbering. Failures of individual images are skipped (best effort).

    Parameters:
        url_list: image URLs to download.
        save_path: target directory, created if missing (parents included).
        page: page number, used in file names and progress messages.
    """
    os.makedirs(save_path, exist_ok=True)
    print('Page:{} Downloading {} images'.format(page, len(url_list)))
    progress = tqdm(url_list, desc='Downloading ', ncols=80, unit='img')
    for index, url in enumerate(progress, start=1):
        img_name = os.path.join(save_path, '{}-{}.jpg'.format(page, index))
        try:
            img = requests.get(url=url, timeout=10, headers=headers)
            time.sleep(1)  # throttle requests to the image hosts
            # Do not store an HTTP error page under a .jpg name.
            img.raise_for_status()
            with open(img_name, 'wb') as f:
                f.write(img.content)
        except (requests.RequestException, OSError):
            # Best effort: a broken link or unwritable file skips this image
            # only; Ctrl-C still interrupts the loop.
            continue
    print('Page:{} Done\n'.format(page))
清理无效图片
def imgCleaning(save_path, minsize):
    """Delete every regular file in ``save_path`` smaller than ``minsize``.

    The size test is the only criterion: the file type is not inspected.

    Parameters:
        save_path: directory holding the downloaded images.
        minsize: size threshold in bytes; files strictly smaller than this
            are treated as invalid and removed.
    """
    print('Delete invalid images\nScaning···', end=' ')
    invalid_list = []
    for entry in os.listdir(save_path):
        path = os.path.join(save_path, entry)
        # os.listdir also returns sub-directories; they are not images and
        # os.remove could not delete them anyway.
        if not os.path.isfile(path):
            continue
        try:
            size = os.path.getsize(path)
        except OSError:
            # The file vanished or is unreadable; nothing to clean here.
            continue
        if size < minsize:
            invalid_list.append(path)
    print('Done\nTotal number of invalid images: {}'.format(len(invalid_list)))

    removed = 0
    for inv_file in tqdm(invalid_list, desc='Deleting ', ncols=80, unit='img'):
        try:
            os.remove(inv_file)
        except OSError:
            continue
        removed += 1
    # Report what was actually deleted, not what was merely scheduled.
    print('Clean {} images Done\n'.format(removed))
清理相似图片
def _hashSimilarity(hash_a, hash_b):
    """Return 1 minus the fraction of differing bits between two hashes."""
    # Subtracting two image hashes yields the number of differing bits; the
    # hash array is square, so len(...) ** 2 is the total bit count.
    return 1 - (hash_a - hash_b) / len(hash_a.hash) ** 2


def hashDetect(file_path, threshold=0.85):
    """Find ``.jpg`` files in ``file_path`` that should be deleted.

    Four hashes (perceptual, average, difference, wavelet) are computed per
    image. Two images count as duplicates when the best of the four
    similarities exceeds ``threshold``; the earlier file in directory
    listing order is kept and the later one is reported.

    Parameters:
        file_path: directory containing the images.
        threshold: similarity in [0, 1] above which two images are treated
            as the same picture.

    Returns:
        List of file paths: first every ``.jpg`` that could not be opened or
        hashed, then every duplicate found.
    """
    highfreq_factor = 4  # resize factor used by the perceptual hash
    hash_size = 32       # hashes are hash_size x hash_size bits
    img_scale = 64       # image scale used by the wavelet hash

    file_list = []
    hash_list = []  # one (phash, ahash, dhash, whash) tuple per kept file
    del_list = []
    for file in tqdm(os.listdir(file_path), desc='Scaning ', ncols=80, unit='img'):
        if os.path.splitext(file)[1] != '.jpg':
            continue
        path_file = os.path.join(file_path, file)
        try:
            # Open once and close deterministically: a leaked handle can
            # prevent the file from being deleted later (e.g. on Windows).
            with Image.open(path_file) as img:
                hashes = (
                    imagehash.phash(img, hash_size=hash_size,
                                    highfreq_factor=highfreq_factor),
                    imagehash.average_hash(img, hash_size=hash_size),
                    imagehash.dhash(img, hash_size=hash_size),
                    imagehash.whash(img, hash_size=hash_size,
                                    image_scale=img_scale, mode='db4'),
                )
        except Exception:
            # Corrupt downloads fail in many different ways inside the
            # imaging libraries; any image that cannot be hashed is
            # scheduled for deletion. Ctrl-C is not swallowed.
            del_list.append(path_file)
            continue
        file_list.append(path_file)
        hash_list.append(hashes)

    total = len(file_list)
    # Flags instead of popping while iterating: no candidate is skipped and
    # the last image takes part in the comparison too.
    is_duplicate = [False] * total
    for i in tqdm(range(total), desc='Calculating ', ncols=80, unit='img'):
        if is_duplicate[i]:
            continue
        for j in range(i + 1, total):
            if is_duplicate[j]:
                continue
            hash_value = max(
                _hashSimilarity(a, b)
                for a, b in zip(hash_list[i], hash_list[j])
            )
            if hash_value > threshold:
                is_duplicate[j] = True
                del_list.append(file_list[j])

    print('Find total number of same images: {}\n'.format(len(del_list)))
    return del_list


def imgDeepClean(file_path):
    """Delete unreadable and near-duplicate ``.jpg`` files in ``file_path``.

    Parameters:
        file_path: directory containing the images.
    """
    print('Delete same images')
    clean_file_list = hashDetect(file_path)
    if not clean_file_list:
        print('None images')
        return

    print('Delete {} images'.format(len(clean_file_list)))
    removed = 0
    for inv_file in tqdm(clean_file_list, desc='Deleting ', ncols=80, unit='img'):
        try:
            os.remove(inv_file)
        except OSError:
            continue
        removed += 1
    # Report what was actually deleted, not what was merely scheduled.
    print('Delete {} images Done\n'.format(removed))