用VBA编写一个读取扫描版PDF的程序,使用Tesseract库和Poppler,做成免安装的版本,增加一个文件选择的窗口,直接写成完整版的代码,方便拷入内网使用
以下是一个完整的VBA脚本,用于读取扫描版PDF并进行OCR识别。这个版本使用了外部库(Tesseract和Poppler),并包含了文件选择窗口:
Option Explicit
' 引用的API声明
Private Declare PtrSafe Function ShellExecute Lib "shell32.dll" Alias "ShellExecuteA" ( _
ByVal hwnd As LongPtr, _
ByVal lpOperation As String, _
ByVal lpFile As String, _
ByVal lpParameters As String, _
ByVal lpDirectory As String, _
ByVal nShowCmd As Long) As LongPtr
Private Declare PtrSafe Function GetTempPath Lib "kernel32" Alias "GetTempPathA" ( _
ByVal nBufferLength As Long, _
ByVal lpBuffer As String) As Long
' 全局变量
Private Const SW_SHOWNORMAL = 1
Private Const PDF_TO_IMAGE_EXE = "pdftoppm.exe"
Private Const TESSERACT_EXE = "tesseract.exe"
' 主程序入口
Sub PDFOCRExtractor()
Dim filePath As String
Dim outputFolder As String
Dim tempFolder As String
' 显示文件选择对话框
filePath = GetPDFFilePath()
If filePath = "" Then
MsgBox "未选择文件", vbExclamation
Exit Sub
End If
' 获取临时文件夹
tempFolder = GetTempDirectory()
' 创建输出文件夹
outputFolder = tempFolder & "PDFOutput\"
CreateFolder outputFolder
' 转换PDF为图像
ConvertPDFToImages filePath, outputFolder
' 执行OCR识别
PerformOCR outputFolder
' 打开输出文件夹
ShellExecute 0, "open", outputFolder, "", "", SW_SHOWNORMAL
End Sub
' 获取PDF文件路径
Private Function GetPDFFilePath() As String
Dim fd As FileDialog
Set fd = Application.FileDialog(msoFileDialogFilePicker)
With fd
.Title = "选择PDF文件"
.Filters.Clear
.Filters.Add "PDF文件", "*.pdf", 1
.AllowMultiSelect = False
If .Show = -1 Then
GetPDFFilePath = .SelectedItems(1)
Else
GetPDFFilePath = ""
End If
End With
End Function
' 创建文件夹
Private Sub CreateFolder(folderPath As String)
Dim fso As Object
Set fso = CreateObject("Scripting.FileSystemObject")
If Not fso.FolderExists(folderPath) Then
fso.CreateFolder folderPath
End If
End Sub
' 获取临时文件夹路径
Private Function GetTempDirectory() As String
Dim tempPath As String * 260
Dim length As Long
length = GetTempPath(260, tempPath)
GetTempDirectory = Left(tempPath, length)
End Function
' 转换PDF为图像
Private Sub ConvertPDFToImages(pdfPath As String, outputFolder As String)
Dim shellCommand As String
' 构建pdftoppm命令
shellCommand = PDF_TO_IMAGE_EXE & " """ & pdfPath & """ """ & outputFolder & "page"" -png"
' 执行转换
ShellExecute 0, "open", "cmd.exe", "/c " & shellCommand, "", SW_SHOWNORMAL
' 等待转换完成(可能需要调整等待时间)
Application.Wait Now + TimeValue("00:00:05")
End Sub
' 执行OCR识别
Private Sub PerformOCR(imageFolder As String)
Dim fso As Object
Dim folder As Object
Dim file As Object
Dim outputPath As String
Dim shellCommand As String
Set fso = CreateObject("Scripting.FileSystemObject")
Set folder = fso.GetFolder(imageFolder)
' 遍历图像文件并执行OCR
For Each file In folder.Files
If LCase(Right(file.Name, 4)) = ".png" Then
outputPath = Replace(file.Path, ".png", ".txt")
' 构建Tesseract命令
shellCommand = TESSERACT_EXE & " """ & file.Path & """ """ & Replace(file.Path, ".png", "") & """ -l chi_sim"
' 执行OCR
ShellExecute 0, "open", "cmd.exe", "/c " & shellCommand, "", SW_SHOWNORMAL
End If
Next file
' 等待OCR完成(可能需要调整等待时间)
Application.Wait Now + TimeValue("00:00:10")
End Sub
' 辅助函数:合并所有识别的文本文件
Sub MergeOCRResults()
Dim fso As Object
Dim folder As Object
Dim file As Object
Dim outputFile As String
Dim textContent As String
Set fso = CreateObject("Scripting.FileSystemObject")
outputFile = GetTempDirectory() & "PDFOutput\merged_ocr_result.txt"
Set folder = fso.GetFolder(GetTempDirectory() & "PDFOutput\")
' 遍历并合并文本文件
For Each file In folder.Files
If LCase(Right(file.Name, 4)) = ".txt" Then
textContent = textContent & ReadTextFile(file.Path) & vbNewLine & "---页面分隔---" & vbNewLine
End If
Next file
' 写入合并后的文本
WriteTextFile outputFile, textContent
' 打开合并后的文件
ShellExecute 0, "open", outputFile, "", "", SW_SHOWNORMAL
End Sub
' 读取文本文件内容
Private Function ReadTextFile(filePath As String) As String
Dim fso As Object
Dim textFile As Object
Set fso = CreateObject("Scripting.FileSystemObject")
Set textFile = fso.OpenTextFile(filePath, 1)
ReadTextFile = textFile.ReadAll
textFile.Close
End Function
' 写入文本文件
Private Sub WriteTextFile(filePath As String, content As String)
Dim fso As Object
Dim textFile As Object
Set fso = CreateObject("Scripting.FileSystemObject")
Set textFile = fso.CreateTextFile(filePath, True)
textFile.Write content
textFile.Close
End Sub
使用说明:
需要准备以下外部程序(建议放在与Excel文件相同的目录):
运行步骤:
PDFOCRExtractor宏可以运行MergeOCRResults宏合并所有识别结果
注意事项:
建议在内网环境中使用,并根据具体情况调整代码。