
Batch-downloading images with a crawler and cleaning them up

Notes

This post only provides the code-block functions for downloading and cleaning the images; it does not include a calling example.

Downloading images with a Python crawler

# Imports needed by the functions below (not listed in the original post).
import os
import re
import time

import requests
from tqdm import tqdm

# Request headers; the User-Agent value here is only an example.
headers = {'User-Agent': 'Mozilla/5.0'}


# Collect image URLs from Baidu image search
def getParsePage(pn, name, save_path):
    '''
    parameter:
    pn : number of result pages (60 images per page)
    name : keyword of the images to download
    save_path : directory to save the downloaded images
    '''
    for i in range(int(pn)):
        # Fetch the result page
        print('Scanning page {}...'.format(i + 1), end=' ')
        # Baidu image search URL; name is the keyword, pn is the result offset
        url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%s&pn=%d' % (name, i * 20)
        response = requests.get(url, headers=headers)
        html = response.content.decode()
        # Parse the page with a regular expression; returns a list of image URLs
        url_list = re.findall('"objURL":"(.*?)",', html)
        print('Done')
        # Save the images behind the collected links to disk
        downloadImg(url_list=url_list, save_path=save_path, page=i + 1)


# Download the images
def downloadImg(url_list, save_path, page):
    '''
    parameter:
    url_list : list of image URLs
    save_path : directory where the downloaded images are saved
    page : page number (used in the file names)
    '''
    try:
        os.mkdir(save_path)  # create the target folder if it does not exist
    except OSError:
        pass
    tem = 1
    print('Page:{} Downloading {} images'.format(page, len(url_list)))
    for url in tqdm(url_list, desc='Downloading ', ncols=80, unit='img'):
        try:
            img = requests.get(url=url, timeout=10, headers=headers)
            time.sleep(1)
            img_name = save_path + '/{}-{}.jpg'.format(page, tem)
            with open(img_name, 'wb') as f:
                f.write(img.content)
        except Exception:
            pass  # skip images that fail to download
        tem += 1
    print('Page:{} Done\n'.format(page))
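Since the post leaves out a calling example, the sketch below shows one plausible way to invoke the download step; the keyword, page count, and save directory are placeholder assumptions rather than values from the original code.

# Hypothetical driver for the download step; all argument values are placeholders.
if __name__ == '__main__':
    keyword = 'cat'          # search keyword (assumed)
    pages = 3                # number of result pages to fetch (assumed)
    save_dir = './images'    # download directory (assumed)
    getParsePage(pn=pages, name=keyword, save_path=save_dir)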

Cleaning up invalid images

def imgCleaning(save_path, minsize):
    '''
    parameter:
    save_path : directory where the images are stored
    minsize : minimum size in bytes; files smaller than this are treated as invalid
    '''
    print('Delete invalid images\nScanning...', end=' ')
    file_list = [os.path.join(save_path, file) for file in os.listdir(save_path)]
    invalid_list = []
    # Collect the files that are too small to be valid images
    for file in file_list:
        size = os.path.getsize(file)
        if size < minsize:
            invalid_list.append(file)
    print('Done\nTotal number of invalid images: {}'.format(len(invalid_list)))
    # Delete the invalid images
    for inv_file in tqdm(invalid_list, desc='Deleting ', ncols=80, unit='img'):
        try:
            os.remove(inv_file)
            time.sleep(0.1)
        except OSError:
            pass  # skip files that cannot be removed
    print('Clean {} images Done\n'.format(len(invalid_list)))
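As with the downloader, a call site is not part of the post; a minimal sketch might look like the following, where the 2048-byte (about 2 KB) threshold is an assumed value chosen so that empty or truncated downloads are caught.

# Hypothetical cleanup call; the 2048-byte threshold is an assumption, not from the post.
save_dir = './images'                          # directory that downloadImg wrote into (assumed)
imgCleaning(save_path=save_dir, minsize=2048)  # remove files smaller than ~2 KB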

Cleaning up similar images

# Additional imports for the duplicate detection (not listed in the original post).
import imagehash
from PIL import Image


def hashDetect(file_path):
    '''
    parameter:
    file_path : directory where the images are stored
    '''
    highfreq_factor = 4  # resize factor for the perceptual hash
    hash_size = 32       # hash length (each hash is hash_size x hash_size bits)
    img_scale = 64       # image scale used by the wavelet hash
    # img_size = hash_size * highfreq_factor
    file_list = []
    phash_list = []
    ahash_list = []
    dhash_list = []
    whash_list = []
    del_list = []
    # Compute four hashes for every .jpg file in the directory
    for file in tqdm(os.listdir(file_path), desc='Scanning ', ncols=80, unit='img'):
        if os.path.splitext(file)[1] == '.jpg':
            path_file = os.path.join(file_path, file)
            try:
                phash = imagehash.phash(Image.open(path_file), hash_size=hash_size, highfreq_factor=highfreq_factor)  # perceptual hash
                ahash = imagehash.average_hash(Image.open(path_file), hash_size=hash_size)  # average hash
                dhash = imagehash.dhash(Image.open(path_file), hash_size=hash_size)  # difference (gradient) hash
                whash = imagehash.whash(Image.open(path_file), hash_size=hash_size, image_scale=img_scale, mode='db4')  # discrete wavelet hash
                phash_list.append(phash)
                ahash_list.append(ahash)
                dhash_list.append(dhash)
                whash_list.append(whash)
                file_list.append(path_file)
            except Exception:
                # Files that cannot be opened or hashed are scheduled for deletion
                del_list.append(path_file)
    # Compare every pair of images; a pair counts as a duplicate when any of the
    # four similarity scores (1 - Hamming distance / number of hash bits) exceeds 0.85
    file_list_len = len(file_list)
    for i in tqdm(range(len(file_list)), desc='Calculating ', ncols=80, unit='img'):
        if i >= file_list_len - 1:
            continue
        j = i + 1
        while j < file_list_len:
            phash_value = 1 - (phash_list[i] - phash_list[j]) / len(phash_list[i].hash) ** 2
            ahash_value = 1 - (ahash_list[i] - ahash_list[j]) / len(ahash_list[i].hash) ** 2
            dhash_value = 1 - (dhash_list[i] - dhash_list[j]) / len(dhash_list[i].hash) ** 2
            whash_value = 1 - (whash_list[i] - whash_list[j]) / len(whash_list[i].hash) ** 2
            hash_value = max(phash_value, ahash_value, dhash_value, whash_value)
            if hash_value > 0.85:
                # Duplicate found: keep image i, schedule image j for deletion
                del_list.append(file_list.pop(j))
                phash_list.pop(j)
                ahash_list.pop(j)
                dhash_list.pop(j)
                whash_list.pop(j)
                file_list_len -= 1
            else:
                j += 1
        time.sleep(1)
    print('Find total number of same images: {}\n'.format(len(del_list)))
    time.sleep(1)
    return del_list


def imgDeepClean(file_path):
    '''
    parameter:
    file_path : directory where the images are stored
    '''
    print('Delete same images')
    clean_file_list = hashDetect(file_path)
    if len(clean_file_list) == 0:
        print('None images')
    else:
        print('Delete {} images'.format(len(clean_file_list)))
        for inv_file in tqdm(clean_file_list, desc='Deleting ', ncols=80, unit='img'):
            try:
                os.remove(inv_file)
                time.sleep(0.2)
            except OSError:
                pass  # skip files that cannot be removed
        print('Delete {} images Done\n'.format(len(clean_file_list)))
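With hash_size=32 each hash carries 32 × 32 = 1024 bits, so the similarity score is 1 minus the Hamming distance divided by 1024; the 0.85 threshold therefore flags pairs that differ in at most 153 bits. The sketch below computes that score directly for two files and then runs the deduplication; the file names and directory are placeholder assumptions.

# Hypothetical check of the similarity score used in hashDetect; 'a.jpg' and 'b.jpg' are placeholders.
h1 = imagehash.phash(Image.open('a.jpg'), hash_size=32)
h2 = imagehash.phash(Image.open('b.jpg'), hash_size=32)
bits = len(h1.hash) ** 2                       # 32 * 32 = 1024 bits per hash
similarity = 1 - (h1 - h2) / bits              # h1 - h2 is the Hamming distance in bits
print('phash similarity: {:.3f}'.format(similarity))  # hashDetect treats > 0.85 as a duplicate

# Hypothetical deduplication call on the same placeholder directory used above.
imgDeepClean('./images')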