#!/usr/bin/env python3
"""
Large-file disk scanner.

Scans a directory tree, prints the entries of each level sorted by size, and
only descends into directories that are large enough to be interesting.
Directories that are known to contain huge numbers of small files (dependency
stores, caches, VCS metadata, ...) are measured but not expanded.
"""

import os
import re
import sys
import argparse
from pathlib import Path
from collections import defaultdict
from typing import Any, Dict, List, Optional, Set, Tuple, Union

# Directory names that are measured but never expanded ("fragment" directories).
# NOTE: matching is done against a single directory *name*, so the path-style
# entries ('Library/Caches', 'go/pkg') can never match; they are kept only so
# the set's contents stay unchanged for anyone importing it.
SKIP_PATTERNS = {
    # Dependencies / package managers
    'node_modules', '.npm', '.yarn', '.pnpm-store',
    'venv', 'env', '.venv', '.env', 'site-packages',
    '__pycache__', '.mypy_cache', '.pytest_cache', '.tox',
    'target', 'build', 'dist', '.gradle', '.m2', '.mvn',
    'vendor', 'Pods', 'Carthage', 'DerivedData',
    # Version control
    '.git', '.svn', '.hg', '.bzr',
    # IDEs / editors
    '.idea', '.vscode', '.vs', '*.egg-info',
    # Caches / temporary data
    '.cache', 'cache', 'Cache',
    'tmp', 'temp', 'Temp', '.tmp', 'Library/Caches',
    # Recycle bins
    '$RECYCLE.BIN', '.Trash', '.Trashes',
    # Other common fragment directories
    '.cargo', '.rustup', 'go/pkg', '.nuget', 'packages',
}

# Default display settings
DEFAULT_TOP_N = 20  # entries shown per level
DEFAULT_FILE_THRESHOLD = 100  # more direct files than this => "fragmented"
DEFAULT_SIZE_THRESHOLD = 10 * 1024 * 1024  # 10MB; smaller dirs are not expanded

# Multipliers accepted by parse_size(); the empty suffix means plain bytes.
_SIZE_UNITS = {'': 1, 'K': 1024, 'M': 1024 ** 2, 'G': 1024 ** 3, 'T': 1024 ** 4}
_SIZE_RE = re.compile(r'(\d+(?:\.\d+)?)\s*([KMGT]?)B?', re.IGNORECASE)


def format_size(size: float) -> str:
    """Format a byte count as a human-readable string (1024-based units).

    Values below 1 in the current unit are shown with two decimals, everything
    else with one decimal; anything at or above 1024 TB is reported in PB.
    """
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size < 1024:
            return f"{size:.1f}{unit}" if size >= 1 else f"{size:.2f}{unit}"
        size /= 1024
    return f"{size:.1f}PB"


def parse_size(text: str) -> int:
    """Parse a size such as ``'2048'``, ``'512K'``, ``'10M'`` or ``'1.5GB'``.

    Args:
        text: A non-negative number optionally followed by K/M/G/T and an
            optional trailing ``B`` (case-insensitive).

    Returns:
        The size in bytes, truncated to an integer.

    Raises:
        ValueError: If *text* is not a recognisable size.
    """
    match = _SIZE_RE.fullmatch(text.strip())
    if match is None:
        raise ValueError(f"无效的大小阈值: {text!r} (示例: 512K, 10M, 2G)")
    number, unit = match.groups()
    return int(float(number) * _SIZE_UNITS[unit.upper()])


def should_skip(name: str, patterns: Optional[Set[str]] = None) -> bool:
    """Return True if a directory called *name* should not be expanded.

    Args:
        name: Bare directory name (not a path).
        patterns: Names to match case-insensitively; a pattern of the form
            ``'*.suffix'`` matches any name ending in ``.suffix``. Defaults to
            the module-level ``SKIP_PATTERNS``.
    """
    if patterns is None:
        patterns = SKIP_PATTERNS
    name_lower = name.lower()
    for pattern in patterns:
        pattern_lower = pattern.lower()
        if pattern_lower == name_lower:
            return True
        if pattern_lower.startswith('*.') and name_lower.endswith(pattern_lower[1:]):
            return True
    return False


def get_dir_info(
    path: Union[str, Path],
    skip_dirs: Optional[Set[str]] = None
) -> Tuple[int, int, List[str]]:
    """Collect shallow (non-recursive) information about one directory.

    Only the direct children of *path* are inspected; symlinks are not
    followed. Entries that cannot be read are ignored, and an unreadable
    *path* yields ``(0, 0, [])``.

    Args:
        path: Directory to inspect.
        skip_dirs: Exact subdirectory names to leave out of the returned list.

    Returns:
        ``(size, file_count, subdirs)``: total bytes of the direct files, the
        number of direct files, and the paths of the direct subdirectories.
    """
    if skip_dirs is None:
        skip_dirs = set()

    total_size = 0
    file_count = 0
    subdirs: List[str] = []

    try:
        # The context manager closes the directory handle promptly.
        with os.scandir(path) as entries:
            for entry in entries:
                try:
                    if entry.is_file(follow_symlinks=False):
                        total_size += entry.stat(follow_symlinks=False).st_size
                        file_count += 1
                    elif entry.is_dir(follow_symlinks=False):
                        if entry.name not in skip_dirs:
                            subdirs.append(entry.path)
                except OSError:  # includes PermissionError
                    pass
    except OSError:
        pass

    return total_size, file_count, subdirs


def _tree_usage(path: Union[str, Path]) -> Tuple[int, int]:
    """Return ``(total_bytes, total_files)`` for everything beneath *path*.

    Walks iteratively with ``os.scandir``; symlinks are neither followed nor
    counted, and unreadable entries/directories are silently left out
    (best-effort measurement).
    """
    total_size = 0
    file_count = 0
    pending = [path]
    while pending:
        current = pending.pop()
        try:
            with os.scandir(current) as entries:
                for entry in entries:
                    try:
                        if entry.is_file(follow_symlinks=False):
                            total_size += entry.stat(follow_symlinks=False).st_size
                            file_count += 1
                        elif entry.is_dir(follow_symlinks=False):
                            pending.append(entry.path)
                    except OSError:
                        continue
        except OSError:
            continue
    return total_size, file_count


def _describe_dir(
    entry: os.DirEntry,
    skip_patterns: Set[str],
    file_threshold: int
) -> Dict[str, Any]:
    """Build the result record for one subdirectory entry."""
    # The size is always the full recursive size, otherwise directories that
    # keep their data in subfolders would look empty and never be expanded.
    size, total_files = _tree_usage(entry.path)

    if should_skip(entry.name, skip_patterns):
        return {
            'name': entry.name,
            'path': entry.path,
            'size': size,
            'type': 'skipped_dir',
            'file_count': total_files,
            'reason': 'known_pattern'
        }

    # Fragmentation is judged on the number of *direct* files only.
    _, direct_files, _ = get_dir_info(entry.path, skip_patterns)
    is_fragmented = direct_files > file_threshold
    return {
        'name': entry.name,
        'path': entry.path,
        'size': size,
        'type': 'fragmented_dir' if is_fragmented else 'dir',
        'file_count': direct_files
    }


def scan_directory(
    path: Path,
    depth: int = 0,
    max_depth: int = -1,
    skip_patterns: Optional[Set[str]] = None,
    file_threshold: int = DEFAULT_FILE_THRESHOLD,
    size_threshold: int = DEFAULT_SIZE_THRESHOLD,
    all_results: Optional[list] = None,
    prefix: str = "",
    top_n: Optional[int] = None
) -> list:
    """Recursively scan *path* and print its entries as a size-sorted tree.

    Args:
        path: Directory to scan.
        depth: Current recursion depth (0 for the root call).
        max_depth: Maximum depth to display; values <= 0 mean unlimited.
        skip_patterns: Directory-name patterns that are measured but not
            expanded. Defaults to ``SKIP_PATTERNS``.
        file_threshold: A directory with more direct files than this is
            labelled "fragmented" and not expanded.
        size_threshold: Only plain directories larger than this many bytes
            are expanded.
        all_results: List that receives one dict per scanned entry; created
            when omitted.
        prefix: Tree-drawing prefix inherited from the parent level.
        top_n: Show at most this many entries per level; ``None`` or a value
            <= 0 shows everything.

    Returns:
        *all_results*, holding a dict (``name``, ``path``, ``size``, ``type``
        and, for directories, ``file_count``) for every entry of every level
        that was scanned.
    """
    if skip_patterns is None:
        skip_patterns = SKIP_PATTERNS

    if all_results is None:
        all_results = []

    if max_depth > 0 and depth >= max_depth:
        return all_results

    # Collect every entry of the current directory.
    entries_data: List[Dict[str, Any]] = []

    try:
        with os.scandir(path) as entries:
            for entry in entries:
                try:
                    if entry.is_file(follow_symlinks=False):
                        entries_data.append({
                            'name': entry.name,
                            'path': entry.path,
                            'size': entry.stat(follow_symlinks=False).st_size,
                            'type': 'file'
                        })
                    elif entry.is_dir(follow_symlinks=False):
                        entries_data.append(
                            _describe_dir(entry, skip_patterns, file_threshold)
                        )
                except OSError:  # unreadable entry: leave it out
                    pass
    except OSError as e:
        # Report instead of silently producing an empty listing.
        print(f"警告: 无法读取目录 - {path} ({e})", file=sys.stderr)
        return all_results

    # Largest entries first.
    entries_data.sort(key=lambda x: x['size'], reverse=True)
    all_results.extend(entries_data)

    total_size = sum(e['size'] for e in entries_data)

    # Header for the root level only.
    if depth == 0:
        print(f"\n📁 {path}")
        print(f" 总大小: {format_size(total_size)}")
        print("-" * 60)

    if top_n is not None and top_n > 0:
        shown = entries_data[:top_n]
    else:
        shown = entries_data
    hidden = len(entries_data) - len(shown)

    for i, entry in enumerate(shown):
        # When entries are hidden, the summary line below is the last branch.
        is_last = i == len(shown) - 1 and hidden == 0

        # Four-column prefixes keep children aligned under the connector.
        connector = "└── " if is_last else "├── "
        child_prefix = "    " if is_last else "│   "

        size_str = format_size(entry['size'])
        name = entry['name']

        # Type marker
        if entry['type'] == 'file':
            type_indicator = "📄"
        elif entry['type'] == 'skipped_dir':
            type_indicator = f"📦 [{entry['file_count']} files, skipped]"
        elif entry['type'] == 'fragmented_dir':
            type_indicator = f"📁 [{entry['file_count']} files, fragmented]"
        else:
            type_indicator = "📁"

        print(f"{prefix}{connector}{type_indicator} {name}: {size_str}")

        # Only plain directories above the size threshold are expanded.
        if entry['type'] == 'dir' and entry['size'] > size_threshold:
            scan_directory(
                Path(entry['path']),
                depth=depth + 1,
                max_depth=max_depth,
                skip_patterns=skip_patterns,
                file_threshold=file_threshold,
                size_threshold=size_threshold,
                all_results=all_results,
                prefix=prefix + child_prefix,
                top_n=top_n
            )

    if hidden > 0:
        print(f"{prefix}└── … 其余 {hidden} 项未显示")

    return all_results


def scan_and_report(
    path: str,
    max_depth: int = -1,
    top_n: int = DEFAULT_TOP_N,
    file_threshold: int = DEFAULT_FILE_THRESHOLD,
    size_threshold: int = DEFAULT_SIZE_THRESHOLD
):
    """Validate *path*, print a report header, scan it and print a footer.

    Args:
        path: Directory to scan.
        max_depth: Maximum display depth; values <= 0 mean unlimited.
        top_n: Maximum number of entries shown per level.
        file_threshold: Direct-file count above which a directory is
            considered fragmented.
        size_threshold: Minimum size in bytes for a directory to be expanded.

    Exits the process with status 1 if *path* does not exist or is not a
    directory.
    """
    target = Path(path).resolve()

    if not target.exists():
        print(f"错误: 路径不存在 - {path}", file=sys.stderr)
        sys.exit(1)

    if not target.is_dir():
        print(f"错误: 不是目录 - {path}", file=sys.stderr)
        sys.exit(1)

    print(f"\n{'='*60}")
    print(f" 磁盘空间扫描报告")
    print(f"{'='*60}")
    print(f" 目标: {target}")
    print(f" 文件数阈值: {file_threshold} (超过视为零碎目录)")
    print(f" 大小阈值: {format_size(size_threshold)} (小于此值不深入)")
    if max_depth > 0:
        print(f" 最大深度: {max_depth}")
    else:
        print(f" 最大深度: 无限制")
    print(f"{'='*60}")

    scan_directory(
        target,
        max_depth=max_depth,
        file_threshold=file_threshold,
        size_threshold=size_threshold,
        top_n=top_n
    )

    print(f"\n{'='*60}")
    print(" 扫描完成")
    print(f"{'='*60}\n")


def main():
    """Command-line entry point: parse arguments and run the scan."""
    parser = argparse.ArgumentParser(
        description='磁盘大文件扫描工具 - 智能扫描找出大文件和大目录',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
示例:
  # 扫描当前目录
  python disk_scanner.py .

  # 扫描指定目录,限制深度3层
  python disk_scanner.py C:\\Users -d 3

  # 自定义阈值
  python disk_scanner.py D:\\ -f 200 -s 50M
'''
    )

    parser.add_argument('path', help='要扫描的目录路径')
    parser.add_argument('-d', '--depth', type=int, default=-1,
                        help='最大扫描深度(默认无限制)')
    parser.add_argument('-n', '--top', type=int, default=DEFAULT_TOP_N,
                        help=f'每层显示前N个条目(默认{DEFAULT_TOP_N})')
    parser.add_argument('-f', '--file-threshold', type=int,
                        default=DEFAULT_FILE_THRESHOLD,
                        help=f'文件数阈值,超过视为零碎目录(默认{DEFAULT_FILE_THRESHOLD})')
    parser.add_argument('-s', '--size-threshold', type=str, default='10M',
                        help='大小阈值,小于此值不深入扫描(默认10M,支持K/M/G后缀)')

    args = parser.parse_args()

    # Turn a malformed threshold into a normal usage error, not a traceback.
    try:
        size_threshold = parse_size(args.size_threshold)
    except ValueError as exc:
        parser.error(str(exc))

    scan_and_report(
        args.path,
        max_depth=args.depth,
        top_n=args.top,
        file_threshold=args.file_threshold,
        size_threshold=size_threshold
    )


if __name__ == '__main__':
    main()