/cyb/ image stats
ls */* | wc -l
15700
du -sh .
7,5G
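
(Side note: ls */* only matches paths one level down, so deeper nesting wouldn't be counted. For reference, a minimal Python sketch that reproduces both numbers recursively with os.walk, assuming the collection sits under './' like in the dupe script below:)

import os

# Count files and total bytes under the current directory, recursively.
file_count = 0
total_bytes = 0
for root, dirs, files in os.walk('.'):
    for name in files:
        path = os.path.join(root, name)
        try:
            total_bytes += os.path.getsize(path)
            file_count += 1
        except OSError:
            pass  # skip unreadable entries

print(file_count)
print(f"{total_bytes / 1024**3:.1f}G")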
import os
import hashlib
from collections import defaultdict


def calculate_checksum(file_path):
    """Calculate the SHA-256 checksum of a file."""
    hasher = hashlib.sha256()
    with open(file_path, 'rb') as f:
        while chunk := f.read(8192):
            hasher.update(chunk)
    return hasher.hexdigest()


def find_duplicates_and_sizes(base_dir):
    """Find duplicate files and their sizes."""
    checksums = defaultdict(list)
    # Iterate through all files in the subdirectories
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                checksum = calculate_checksum(file_path)
                file_size = os.path.getsize(file_path)
                checksums[checksum].append((file_path, file_size))
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
    return checksums


def human_readable_size(size):
    """Convert bytes to a human-readable format."""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    return f"{size:.2f} PB"  # in case of very large sizes


def output_total_stats(duplicates):
    """Output the total statistics of duplicates."""
    total_dupes = 0
    total_bytes = 0
    for files in duplicates.values():
        if len(files) > 1:  # only interested in duplicates
            total_dupes += len(files)
            total_bytes += sum(size for _, size in files)
    print(f"Total duplicates: {total_dupes}")
    print(f"Total bytes of duplicates: {human_readable_size(total_bytes)}")


if __name__ == "__main__":
    base_directory = './'  # change this to the desired base directory
    duplicates = find_duplicates_and_sizes(base_directory)
    output_total_stats(duplicates)
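
Note that output_total_stats counts every copy in a duplicate group, including the one you'd presumably keep, so "total bytes of duplicates" is not the same as reclaimable space. A small variant that reports only the redundant copies (a sketch reusing the checksums dict and human_readable_size from above):

def output_reclaimable(duplicates):
    """Report redundant copies and the space freed by deleting them,
    keeping one copy per group."""
    extra_copies = 0
    reclaimable = 0
    for files in duplicates.values():
        if len(files) > 1:
            extra_copies += len(files) - 1
            # Same checksum implies same content, hence same size;
            # everything beyond the first copy is reclaimable.
            reclaimable += sum(size for _, size in files[1:])
    print(f"Redundant copies: {extra_copies}")
    print(f"Reclaimable space: {human_readable_size(reclaimable)}")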
python3 dupe.py
Total duplicates: 4854
Total bytes of duplicates: 2.80 GB
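
Hashing reads the whole 7,5G, so the script is I/O-bound. A standard speedup is to bucket files by size first and only checksum sizes that collide, since files with different sizes can't be duplicates. A rough sketch of that pre-filter (reusing calculate_checksum from the script above):

import os
from collections import defaultdict

def find_duplicates_fast(base_dir):
    """Like find_duplicates_and_sizes, but only hashes files whose
    sizes collide."""
    by_size = defaultdict(list)
    for root, dirs, files in os.walk(base_dir):
        for name in files:
            path = os.path.join(root, name)
            try:
                by_size[os.path.getsize(path)].append(path)
            except OSError:
                pass  # skip unreadable entries
    checksums = defaultdict(list)
    for size, paths in by_size.items():
        if len(paths) < 2:
            continue  # a unique size can't have duplicates
        for path in paths:
            checksums[calculate_checksum(path)].append((path, size))
    return checksums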