Option Explicit
Private Declare PtrSafe Function ShellExecute Lib "shell32.dll" Alias "ShellExecuteA" ( _ ByVal hwnd As LongPtr, _ ByVal lpOperation As String, _ ByVal lpFile As String, _ ByVal lpParameters As String, _ ByVal lpDirectory As String, _ ByVal nShowCmd As Long) As LongPtr
Private Const SW_SHOWNORMAL = 1 Private gPdfPath As String Private gOutputPath As String
' 主程序入口 Sub PDFTextExtractor() Dim fso As Object Dim outputFile As Object Dim startTime As Double Dim timeoutSeconds As Integer Dim fileExists As Boolean Dim tesseractPath As String Dim fsoCheck As Object
' 选择PDF文件
If Not SelectPDFFile() Then Exit Sub
' 检查tesseract.exe是否存在(假设放在工作簿所在目录)
tesseractPath = ThisWorkbook.Path & "\tesseract.exe"
Set fsoCheck = CreateObject("Scripting.FileSystemObject")
If Not fsoCheck.fileExists(tesseractPath) Then
MsgBox "未找到 tesseract.exe,请将其放在与工作簿相同的目录下。", vbCritical
Exit Sub
End If
继续你的代码,看起来是一个PDF文本提取的VBA宏。我可以帮你完善这个宏。以下是代码的后续部分:
' 设置输出路径
gOutputPath = ThisWorkbook.Path & "\output.txt"
' 创建文件系统对象
Set fso = CreateObject("Scripting.FileSystemObject")
' 删除之前的输出文件(如果存在)
If fso.FileExists(gOutputPath) Then
fso.DeleteFile gOutputPath
End If
' 设置超时时间(秒)
timeoutSeconds = 300 ' 5分钟
' 记录开始时间
startTime = Timer
' 调用Tesseract提取PDF文本
Dim shellResult As LongPtr
shellResult = ShellExecute(0, "open", tesseractPath, """" & gPdfPath & """ """ & Left(gOutputPath, Len(gOutputPath) - 4) & """", "", SW_SHOWNORMAL)
' 等待文件生成,并设置超时
Do
fileExists = fso.FileExists(gOutputPath)
' 检查是否超时
If Timer - startTime > timeoutSeconds Then
MsgBox "文本提取超时,可能PDF文件过大或Tesseract处理失败。", vbExclamation
Exit Sub
End If
' 给系统一些处理时间
DoEvents
Loop Until fileExists
' 打开输出文件
Set outputFile = fso.OpenTextFile(gOutputPath, 1)
' 读取并显示文本
Dim extractedText As String
extractedText = outputFile.ReadAll
outputFile.Close
' 将文本显示在新工作表
DisplayTextInWorksheet extractedText
' 提示完成
MsgBox "PDF文本提取完成!", vbInformation
End Sub
' 选择PDF文件
Function SelectPDFFile() As Boolean
Dim fd As FileDialog
Set fd = Application.FileDialog(msoFileDialogFilePicker)
With fd
.Title = "选择PDF文件"
.Filters.Clear
.Filters.Add "PDF文件", "*.pdf", 1
.AllowMultiSelect = False
If .Show = -1 Then
gPdfPath = .SelectedItems(1)
SelectPDFFile = True
Else
SelectPDFFile = False
End If
End With
End Function
' 在新工作表显示文本
Sub DisplayTextInWorksheet(extractedText As String)
Dim ws As Worksheet
' 创建新工作表
Set ws = ThisWorkbook.Worksheets.Add(After:=ThisWorkbook.Worksheets(ThisWorkbook.Worksheets.Count))
ws.Name = "PDF文本"
' 在A1单元格写入文本
ws.Range("A1").Value = extractedText
' 自动调整列宽
ws.Columns("A:A").AutoFit
End Sub
这个VBA宏实现了以下功能:
使用说明:
tesseract.exe注意事项:
你可以根据实际需求进一步修改和优化代码。