Identify duplicate .mp3 files by size, then confirm that they are duplicates by hashing their content, in Python

main.py
import os
import hashlib

# Function to find duplicate files by size
def find_duplicate_by_size(directory, extensions=(".mp3",)):
    """Collect candidate duplicate files under *directory* by comparing sizes.

    The tree rooted at *directory* is walked recursively. Equal size is only
    a cheap pre-filter: the returned pairs are candidates that still have to
    be confirmed by comparing content.

    Args:
        directory: Root folder to scan.
        extensions: File-name suffix (or iterable of suffixes) to consider,
            matched case-insensitively. Defaults to ``(".mp3",)``. Pass
            ``None`` to consider every file regardless of its name.

    Returns:
        A list of ``(path, earlier_path)`` tuples, one for every pair of
        scanned files that have the same size. ``path`` is the file seen
        later in the walk, ``earlier_path`` one seen before it.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = None
    if extensions is not None:
        suffixes = tuple(ext.lower() for ext in extensions)

    files_by_size = {}
    duplicates = []

    for dirpath, _, filenames in os.walk(directory):
        for filename in filenames:
            if suffixes is not None and not filename.lower().endswith(suffixes):
                continue
            file_path = os.path.join(dirpath, filename)
            try:
                file_size = os.path.getsize(file_path)
            except OSError:
                # Broken symlink, or the file vanished / became unreadable
                # mid-scan: it cannot be compared, so leave it out.
                continue
            same_size = files_by_size.setdefault(file_size, [])
            # Pair with *every* earlier file of this size, not just the first
            # one: otherwise two identical files that merely share their size
            # with an unrelated earlier file would never be compared.
            duplicates.extend((file_path, earlier) for earlier in same_size)
            same_size.append(file_path)

    return duplicates

# Helper: hash one file incrementally so large files are never fully in memory
def _sha256_of_file(path, chunk_size=1024 * 1024):
    """Return the SHA-256 hex digest of the file at *path*, read in chunks."""
    digest = hashlib.sha256()
    with open(path, 'rb') as handle:
        # iter() with a sentinel stops at the empty bytes object read at EOF.
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


# Function to check duplicate files by content using hashing
def find_duplicate_by_content(duplicates):
    """Keep only the candidate pairs whose files have identical content.

    Args:
        duplicates: Iterable of ``(file1, file2)`` path pairs, e.g. the
            same-size candidates produced by ``find_duplicate_by_size``.

    Returns:
        A list of the ``(file1, file2)`` pairs, in input order, whose
        SHA-256 digests are equal.

    Raises:
        OSError: If a listed file cannot be opened or read.
    """
    # A path usually appears in several pairs; hash each file only once.
    digest_cache = {}

    def digest_of(path):
        if path not in digest_cache:
            digest_cache[path] = _sha256_of_file(path)
        return digest_cache[path]

    return [
        (file1, file2)
        for file1, file2 in duplicates
        if digest_of(file1) == digest_of(file2)
    ]

# Steps to find duplicates
# NOTE(review): "path_to_directory" looks like a placeholder — presumably it
# should be replaced with the real folder to scan before running.
directory = "path_to_directory"
# Pass 1: collect candidate pairs of paths whose file sizes are equal.
duplicates_by_size = find_duplicate_by_size(directory)
# Pass 2: keep only the candidate pairs whose SHA-256 content digests match.
duplicate_files = find_duplicate_by_content(duplicates_by_size)

# Print the confirmed pairs as a list of (path, path) tuples.
print(duplicate_files)
1243 chars
38 lines

This code first finds duplicate .mp3 files by comparing their sizes and then confirms if they are duplicates by comparing their content using hash values.

related categories

gistlib by LogSnag