VisCGEC/ocr_processor.py at main · JettyCoffee/VisCGEC · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import os
import sys
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import subprocess
from pathlib import Path
import logging
import math

# 配置日志
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

class OCRProcessor:
    def __init__(
        self,
        input_dir,
        output_dir,
        det_model_dir="models/PaddleOCR/ppstructure/inference/PP-OCRv5_server_det_infer",
        rec_model_dir="models/PaddleOCR/ppstructure/inference/PP-OCRv5_server_rec_infer",
        rec_char_dict_path="models/PaddleOCR/ppocr/utils/ppocrv5_dict.txt",
        table_model_dir="models/PaddleOCR/ppstructure/inference/ch_ppstructure_mobile_v2.0_SLANet_infer",
        table_char_dict_path="models/PaddleOCR/ppocr/utils/dict/table_structure_dict_ch.txt",
        layout_model_dir="models/PaddleOCR/ppstructure/inference/picodet_lcnet_x1_0_fgd_layout_cdla_infer",
        layout_dict_path="models/PaddleOCR/ppocr/utils/dict/layout_dict/layout_cdla_dict.txt",
        vis_font_path="models/PaddleOCR/doc/fonts/chinese_cht.ttf",
        max_workers_per_gpu=4,
        gpu_ids=[0, 1]  # 指定要使用的GPU ID
    ):
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.det_model_dir = det_model_dir
        self.rec_model_dir = rec_model_dir
        self.rec_char_dict_path = rec_char_dict_path
        self.table_model_dir = table_model_dir
        self.table_char_dict_path = table_char_dict_path
        self.layout_model_dir = layout_model_dir
        self.layout_dict_path = layout_dict_path
        self.vis_font_path = vis_font_path
        self.max_workers_per_gpu = max_workers_per_gpu
        self.gpu_ids = gpu_ids

        # 确保输出目录存在
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def process_single_image(self, args):
        """处理单张图片"""
        image_path, gpu_id = args
        try:
            output_dir = self.output_dir / image_path.stem
            output_dir.mkdir(parents=True, exist_ok=True)

            # 设置环境变量指定GPU
            env = os.environ.copy()
            env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

            cmd = [
                "python", "models/PaddleOCR/ppstructure/predict_system_enhanced.py",
                f"--image_dir={str(image_path)}",
                f"--det_model_dir={self.det_model_dir}",
                f"--rec_model_dir={self.rec_model_dir}",
                f"--rec_char_dict_path={self.rec_char_dict_path}",
                f"--table_model_dir={self.table_model_dir}",
                f"--table_char_dict_path={self.table_char_dict_path}",
                f"--layout_model_dir={self.layout_model_dir}",
                f"--layout_dict_path={self.layout_dict_path}",
                f"--vis_font_path={self.vis_font_path}",
                f"--output={str(output_dir)}",
                "--return_word_box=True",
                "--ocr=True",
                "--table=False",
                "--layout=True",
                "--use_angle_cls=False",
                "--use_mp=True",
            ]

            logging.info(f"Processing image: {image_path} on GPU {gpu_id}")
            result = subprocess.run(cmd, capture_output=True, text=True, env=env)

            if result.returncode == 0:
                logging.info(f"Successfully processed {image_path} on GPU {gpu_id}")
            else:
                logging.error(f"Error processing {image_path} on GPU {gpu_id}: {result.stderr}")

        except Exception as e:
            logging.error(f"Error processing {image_path} on GPU {gpu_id}: {str(e)}")

    def process_all_images(self):
        """多GPU并行处理所有图片"""
        image_files = [
            f for f in self.input_dir.glob("*")
            if f.suffix.lower() in {'.jpg', '.jpeg', '.png', '.JPG'}
        ]

        total_images = len(image_files)
        logging.info(f"Found {total_images} images to process")

        # 为每个图片分配GPU
        tasks = []
        for i, image_file in enumerate(image_files):
            gpu_id = self.gpu_ids[i % len(self.gpu_ids)]
            tasks.append((image_file, gpu_id))

        # 按GPU分组任务
        gpu_tasks = {}
        for gpu_id in self.gpu_ids:
            gpu_tasks[gpu_id] = [task for task in tasks if task[1] == gpu_id]

        # 创建进程池，每个GPU一个进程
        with ProcessPoolExecutor(max_workers=len(self.gpu_ids)) as process_executor:
            # 提交每个GPU的任务批次
            futures = []
            for gpu_id, gpu_batch in gpu_tasks.items():
                future = process_executor.submit(self._process_gpu_batch, gpu_batch)
                futures.append(future)

            # 等待所有任务完成
            for future in futures:
                future.result()

        logging.info("所有图片处理完成")

    def _process_gpu_batch(self, gpu_tasks):
        """处理单个GPU上的任务批次"""
        with ThreadPoolExecutor(max_workers=self.max_workers_per_gpu) as thread_executor:
            list(thread_executor.map(self.process_single_image, gpu_tasks))

if __name__ == "__main__":
    # 设置输入输出目录
    input_dir = "data/preprocessed_img"
    output_dir = "data/paddleocr_version/ocr_output"

    # 创建OCR处理器实例
    processor = OCRProcessor(
        input_dir=input_dir,
        output_dir=output_dir,
        max_workers_per_gpu=6,
        gpu_ids=[0, 1]
    )

    # 处理所有图片
    processor.process_all_images()