VBA免安装扫描PDF OCR完整代码

Vibe Coding 使用 AI 回答

代码问题:

用VBA编写一个读取扫描版PDF的程序,使用Tesseract库和Poppler,做成免安装的版本,增加一个文件选择的窗口,直接写成完整版的代码,方便拷入内网使用

代码解决方案:

以下是一个完整的VBA脚本,用于读取扫描版PDF并进行OCR识别。这个版本使用了外部库(Tesseract和Poppler),并包含了文件选择窗口:

Option Explicit

' 引用的API声明
Private Declare PtrSafe Function ShellExecute Lib "shell32.dll" Alias "ShellExecuteA" ( _
    ByVal hwnd As LongPtr, _
    ByVal lpOperation As String, _
    ByVal lpFile As String, _
    ByVal lpParameters As String, _
    ByVal lpDirectory As String, _
    ByVal nShowCmd As Long) As LongPtr

Private Declare PtrSafe Function GetTempPath Lib "kernel32" Alias "GetTempPathA" ( _
    ByVal nBufferLength As Long, _
    ByVal lpBuffer As String) As Long

' 全局变量
Private Const SW_SHOWNORMAL = 1
Private Const PDF_TO_IMAGE_EXE = "pdftoppm.exe"
Private Const TESSERACT_EXE = "tesseract.exe"

' 主程序入口
Sub PDFOCRExtractor()
    Dim filePath As String
    Dim outputFolder As String
    Dim tempFolder As String
    
    ' 显示文件选择对话框
    filePath = GetPDFFilePath()
    
    If filePath = "" Then
        MsgBox "未选择文件", vbExclamation
        Exit Sub
    End If
    
    ' 获取临时文件夹
    tempFolder = GetTempDirectory()
    
    ' 创建输出文件夹
    outputFolder = tempFolder & "PDFOutput\"
    CreateFolder outputFolder
    
    ' 转换PDF为图像
    ConvertPDFToImages filePath, outputFolder
    
    ' 执行OCR识别
    PerformOCR outputFolder
    
    ' 打开输出文件夹
    ShellExecute 0, "open", outputFolder, "", "", SW_SHOWNORMAL
End Sub

' 获取PDF文件路径
Private Function GetPDFFilePath() As String
    Dim fd As FileDialog
    
    Set fd = Application.FileDialog(msoFileDialogFilePicker)
    
    With fd
        .Title = "选择PDF文件"
        .Filters.Clear
        .Filters.Add "PDF文件", "*.pdf", 1
        .AllowMultiSelect = False
        
        If .Show = -1 Then
            GetPDFFilePath = .SelectedItems(1)
        Else
            GetPDFFilePath = ""
        End If
    End With
End Function

' 创建文件夹
Private Sub CreateFolder(folderPath As String)
    Dim fso As Object
    Set fso = CreateObject("Scripting.FileSystemObject")
    
    If Not fso.FolderExists(folderPath) Then
        fso.CreateFolder folderPath
    End If
End Sub

' 获取临时文件夹路径
Private Function GetTempDirectory() As String
    Dim tempPath As String * 260
    Dim length As Long
    
    length = GetTempPath(260, tempPath)
    GetTempDirectory = Left(tempPath, length)
End Function

' 转换PDF为图像
Private Sub ConvertPDFToImages(pdfPath As String, outputFolder As String)
    Dim shellCommand As String
    
    ' 构建pdftoppm命令
    shellCommand = PDF_TO_IMAGE_EXE & " """ & pdfPath & """ """ & outputFolder & "page"" -png"
    
    ' 执行转换
    ShellExecute 0, "open", "cmd.exe", "/c " & shellCommand, "", SW_SHOWNORMAL
    
    ' 等待转换完成(可能需要调整等待时间)
    Application.Wait Now + TimeValue("00:00:05")
End Sub

' 执行OCR识别
Private Sub PerformOCR(imageFolder As String)
    Dim fso As Object
    Dim folder As Object
    Dim file As Object
    Dim outputPath As String
    Dim shellCommand As String
    
    Set fso = CreateObject("Scripting.FileSystemObject")
    Set folder = fso.GetFolder(imageFolder)
    
    ' 遍历图像文件并执行OCR
    For Each file In folder.Files
        If LCase(Right(file.Name, 4)) = ".png" Then
            outputPath = Replace(file.Path, ".png", ".txt")
            
            ' 构建Tesseract命令
            shellCommand = TESSERACT_EXE & " """ & file.Path & """ """ & Replace(file.Path, ".png", "") & """ -l chi_sim"
            
            ' 执行OCR
            ShellExecute 0, "open", "cmd.exe", "/c " & shellCommand, "", SW_SHOWNORMAL
        End If
    Next file
    
    ' 等待OCR完成(可能需要调整等待时间)
    Application.Wait Now + TimeValue("00:00:10")
End Sub

' 辅助函数:合并所有识别的文本文件
Sub MergeOCRResults()
    Dim fso As Object
    Dim folder As Object
    Dim file As Object
    Dim outputFile As String
    Dim textContent As String
    
    Set fso = CreateObject("Scripting.FileSystemObject")
    outputFile = GetTempDirectory() & "PDFOutput\merged_ocr_result.txt"
    
    Set folder = fso.GetFolder(GetTempDirectory() & "PDFOutput\")
    
    ' 遍历并合并文本文件
    For Each file In folder.Files
        If LCase(Right(file.Name, 4)) = ".txt" Then
            textContent = textContent & ReadTextFile(file.Path) & vbNewLine & "---页面分隔---" & vbNewLine
        End If
    Next file
    
    ' 写入合并后的文本
    WriteTextFile outputFile, textContent
    
    ' 打开合并后的文件
    ShellExecute 0, "open", outputFile, "", "", SW_SHOWNORMAL
End Sub

' 读取文本文件内容
Private Function ReadTextFile(filePath As String) As String
    Dim fso As Object
    Dim textFile As Object
    
    Set fso = CreateObject("Scripting.FileSystemObject")
    Set textFile = fso.OpenTextFile(filePath, 1)
    
    ReadTextFile = textFile.ReadAll
    
    textFile.Close
End Function

' 写入文本文件
Private Sub WriteTextFile(filePath As String, content As String)
    Dim fso As Object
    Dim textFile As Object
    
    Set fso = CreateObject("Scripting.FileSystemObject")
    Set textFile = fso.CreateTextFile(filePath, True)
    
    textFile.Write content
    textFile.Close
End Sub

使用说明:

  1. 需要准备以下外部程序(建议放在与Excel文件相同的目录):

    • pdftoppm.exe(Poppler工具)
    • tesseract.exe(Tesseract OCR)
    • 中文语言包(chi_sim)
  2. 运行步骤:

    • 点击运行PDFOCRExtractor
    • 选择需要识别的PDF文件
    • 程序会自动转换和识别
    • 最终结果会保存在临时文件夹中
  3. 可以运行MergeOCRResults宏合并所有识别结果

注意事项:

  • 需要提前下载Poppler和Tesseract的Windows版本
  • 可能需要调整等待时间
  • 大文件可能需要更长的处理时间

建议在内网环境中使用,并根据具体情况调整代码。