macOS txt批量转utf-8

brew install enca
find . -iname "*.txt" -exec enconv -L zh_CN -x UTF-8 {} \;

有些UTF-16 LE的文件,这个命令可能会报错:

enconv: Cannot convert `./test.txt' from unknown encoding
enconv: Iconv conversion error on `/tmp/enca9HsBBK': Illegal byte sequence
Trying to recover... succeeded.

虽然出错后它会自动恢复原文件,但文件仍然是原来的 UTF-16 LE 编码,并没有被转换,所以还是不用 enca 了,改用 Python 吧。

"""
UTF-8 转换工具
用于将指定目录下的所有文本文件(.txt)转换为 UTF-8 编码格式。

使用方法:
- 修改 directory 变量,脚本会递归遍历其中的所有 txt 并转换为 UTF-8
- 具体转换结果可看日志:conversion_log.txt

requirements.txt:
    chardet==5.2.0
    tqdm==4.66.4
"""

import os
import chardet
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import logging
import sys

# Number of leading bytes handed to chardet for the first, cheap detection pass.
_DETECTION_SAMPLE_SIZE = 5120
_UTF8_BOM = b'\xef\xbb\xbf'
# chardet frequently reports GB2312 for files that actually use GBK/GB18030
# characters; GB18030 is a superset, so decoding with it is never worse.
_DECODE_AS = {'gb2312': 'gb18030'}


def _decodes_as_utf8(raw):
    """Return True when *raw* is strictly valid UTF-8 (plain ASCII included)."""
    try:
        raw.decode('utf-8')
    except UnicodeDecodeError:
        return False
    return True


def _decode_text(raw):
    """Work out the encoding of *raw* and decode it without losing data.

    Args:
        raw: The complete file content as bytes.

    Returns:
        A ``(encoding, text)`` tuple. ``text`` is the decoded string when the
        file has to be rewritten, or ``None`` when the content is already
        BOM-less UTF-8/ASCII or no encoding could be detected.

    Raises:
        UnicodeError: An encoding was detected but the content cannot be
            decoded with it strictly.
    """
    # Fast path: content with non-ASCII bytes that is valid UTF-8 needs no
    # guessing. Pure 7-bit data is still sent to chardet (escape-based
    # encodings are 7-bit too), and data containing NUL bytes is more likely
    # BOM-less UTF-16/32 than real UTF-8 text.
    if not raw.isascii() and b'\x00' not in raw and _decodes_as_utf8(raw):
        if raw.startswith(_UTF8_BOM):
            return 'UTF-8-SIG', raw.decode('utf-8-sig')
        return 'utf-8', None

    # Detect on a small sample first; fall back to the whole content when the
    # sample's verdict does not hold for the rest of the file.
    samples = [raw[:_DETECTION_SAMPLE_SIZE]]
    if len(raw) > _DETECTION_SAMPLE_SIZE:
        samples.append(raw)

    rejected = None
    for sample in samples:
        detected = chardet.detect(sample)['encoding']
        if not detected:
            continue
        if detected.lower() in ('ascii', 'utf-8'):
            # Only trust the verdict if it holds for the complete content.
            if _decodes_as_utf8(raw):
                return detected, None
        else:
            codec = _DECODE_AS.get(detected.lower(), detected)
            try:
                return detected, raw.decode(codec)
            except (UnicodeDecodeError, LookupError):
                pass
        rejected = detected

    if rejected is None:
        return None, None
    raise UnicodeError(f"content is not valid '{rejected}', file left untouched")


def convert_to_utf8(file_path):
    """Convert a single file to BOM-less UTF-8 in place.

    The content is decoded strictly, so a wrong encoding guess results in an
    error with the file left untouched rather than in characters silently
    replaced. Bytes are written in binary mode, which keeps the original line
    endings. Files that are already ASCII or UTF-8 are not rewritten.

    Args:
        file_path: Path of the file to convert.

    Returns:
        True if the file was rewritten as UTF-8, False if it was skipped or
        an error occurred. The outcome is also written to the log.
    """
    try:
        with open(file_path, 'rb') as f:
            raw = f.read()

        encoding, text = _decode_text(raw)
        if text is None:
            logging.info(f"SKIP! File '{file_path}' doesn't need encoding conversion. Detected encoding: {encoding}")
            return False

        # Encode before reopening the file so that an encoding failure cannot
        # leave a truncated file behind.
        payload = text.encode('utf-8')
        with open(file_path, 'wb') as f:
            f.write(payload)
        logging.info(f"SUCCESS! File '{file_path}' converted from '{encoding}' to 'utf-8'.")
        return True
    except Exception as e:
        # Worker boundary: one bad file must not abort the whole batch.
        logging.error(f"ERROR! Error processing file: {file_path}, {e}")
        return False

def convert_txt_files_to_utf8(directory, log_file):
    """Recursively convert every ``.txt`` file under *directory* to UTF-8.

    Files are processed by a pool of 8 worker threads with a progress bar.
    The per-file outcome is written to *log_file*.

    Args:
        directory: Root directory that is walked recursively.
        log_file: Path of the log file. It is excluded from conversion even
            if it lives inside *directory*.
    """
    logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

    # The log is being appended to while the workers run, so it must never be
    # picked up as a conversion candidate itself.
    log_path = os.path.realpath(log_file)

    file_list = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Case-insensitive match so that e.g. "README.TXT" is included too.
            # To convert other file types, change the suffix here.
            if not file.lower().endswith('.txt'):
                continue
            file_path = os.path.join(root, file)
            if os.path.realpath(file_path) == log_path:
                continue
            file_list.append(file_path)

    with ThreadPoolExecutor(max_workers=8) as executor:
        # list() drains the iterator so the progress bar advances per file.
        results = list(tqdm(executor.map(convert_to_utf8, file_list), total=len(file_list), desc='Converting files', unit='file'))

    converted = sum(1 for outcome in results if outcome)
    logging.info("Converted %d of %d file(s).", converted, len(file_list))
    logging.info("Conversion completed!")
    print("Conversion completed!")

if __name__ == "__main__":
    # Target directory: the first command-line argument when given, otherwise
    # the path configured below.
    directory = sys.argv[1] if len(sys.argv) > 1 else "/Path/to/directory"
    log_file = "conversion_log.txt"
    # os.walk() silently yields nothing for a missing directory, which would
    # otherwise be reported as a successful run.
    if not os.path.isdir(directory):
        sys.exit(f"Not a directory: '{directory}'")
    convert_txt_files_to_utf8(directory, log_file)

Last modification:May 12, 2024
V50%看看实力