相關庫
.NET 8
編碼識別: github.com/CharsetDetector/UTF-unknown
<PackageReference Include="UTF.Unknown" Version="2.5.1" />
代碼
using UtfUnknown;

// Batch-converts text files to UTF-8.
//
// Every file directly inside sourceDir is inspected with UTF-unknown:
//   * detected as gb18030 -> decoded and re-written as UTF-8 (no BOM) into utf8Dir
//   * detected as utf-8   -> copied unchanged into utf8Dir
//   * anything else       -> copied unchanged into restDir, with a warning
// A file whose encoding cannot be detected at all aborts the run with an
// InvalidDataException.

// Only the first few bytes of each file are analysed: detection over a whole
// large file is too slow.
// NOTE(review): a sample this small can misclassify a file whose leading bytes
// are plain ASCII; raise the value if that happens.
const int sampleSize = 100;

var sourceDir = "D:\\Desktop\\新建文件夾2\\新建文件夾";
var utf8Dir = "D:\\Desktop\\新建文件夾2\\utf8_files";
var restDir = "D:\\Desktop\\新建文件夾2\\rest_files";

// CreateDirectory does nothing when the directory already exists.
Directory.CreateDirectory(utf8Dir);
Directory.CreateDirectory(restDir);

// Process files concurrently. Parallel.ForEachAsync awaits every async body,
// so the program ends only after all files are done and any failure surfaces
// here instead of being lost in an async-void lambda.
await Parallel.ForEachAsync(Directory.EnumerateFiles(sourceDir), async (filePath, cancellationToken) =>
{
    var fileName = Path.GetFileName(filePath);
    var utf8Path = Path.Combine(utf8Dir, fileName);
    var restPath = Path.Combine(restDir, fileName);

    var fileBytes = await File.ReadAllBytesAsync(filePath, cancellationToken);

    // Clamp the sample so files shorter than sampleSize do not throw.
    var sample = fileBytes[..Math.Min(fileBytes.Length, sampleSize)];

    // Detected is null when the detector could not identify any encoding.
    var detected = CharsetDetector.DetectFromBytes(sample).Detected
        ?? throw new InvalidDataException($"未知類型 {filePath}");

    switch (detected.EncodingName)
    {
        case "gb18030":
        {
            // NOTE(review): Encoding is presumably null when the runtime lacks this
            // code page - confirm against the UTF-unknown documentation.
            var encoding = detected.Encoding
                ?? throw new NotSupportedException($"不支持的編碼 {detected.EncodingName}: {filePath}");

            // gb2312/gb18030 bytes -> C# string (UTF-16).
            var text = encoding.GetString(fileBytes);

            // WriteAllTextAsync without an explicit encoding writes UTF-8 without a BOM.
            await File.WriteAllTextAsync(utf8Path, text, cancellationToken);
            Console.WriteLine($"{fileName} gb2312 -> utf8 完成");

            // If every file is known in advance to be gb2312, detection can be skipped
            // entirely for better performance. Register the provider once, outside the loop:
            //   Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
            // and decode directly:
            //   var str = Encoding.GetEncoding("gb18030").GetString(fileBytes);
            break;
        }

        case "utf-8":
            // Overwrite so a re-run behaves like the conversion branch above,
            // which also replaces existing output.
            File.Copy(filePath, utf8Path, overwrite: true);
            Console.WriteLine($"{fileName} utf8 復制完成");
            break;

        default:
            File.Copy(filePath, restPath, overwrite: true);
            Console.WriteLine($"warn: {fileName} {detected.EncodingName} 復制完成 ");
            break;
    }
});