PDFからテキスト抽出(すこし直した)
AppleScript サンプルコード
行番号 | ソース |
---|---|
001 | #!/usr/bin/env osascript |
002 | ----+----1----+----2----+-----3----+----4----+----5----+----6----+----7 |
003 | (* |
004 | com.cocolog-nifty.quicktimer.icefloe |
005 | PDFのテキストコンテンツを |
006 | PDFページ毎にテキストファイルとRTFファイルに書き出します |
007 | RTFファイルでは簡易にフォントとサイズの確認が出来ます |
008 | *) |
009 | ----+----1----+----2----+-----3----+----4----+----5----+----6----+----7 |
010 | use AppleScript version "2.8" |
011 | use framework "Foundation" |
012 | use framework "AppKit" |
013 | use framework "PDFKit" |
014 | use scripting additions |
015 | |
016 | property refMe : a reference to current application |
017 | |
018 | ######################## |
019 | # Input dialog: activate an application, then ask the user to pick one PDF file (Finder is activated when this runs under "osascript", otherwise the current application) |
020 | set strName to (name of current application) as text |
021 | if strName is "osascript" then |
022 | tell application "Finder" to activate |
023 | else |
024 | tell current application to activate |
025 | end if |
026 | # Default location: the first URL returned for the user's Desktop directory, converted to an alias |
027 | set appFileManager to refMe's NSFileManager's defaultManager() |
028 | set ocidURLsArray to (appFileManager's URLsForDirectory:(refMe's NSDesktopDirectory) inDomains:(refMe's NSUserDomainMask)) |
029 | set ocidDesktopDirPathURL to ocidURLsArray's firstObject() |
030 | set aliasDesktopDirPath to (ocidDesktopDirPathURL's absoluteURL()) as alias |
031 | set listUTI to {"com.adobe.pdf"} |
032 | set strMes to ("PDFファイルを選んでください") as text |
033 | set strPrompt to ("PDFファイルを選んでください") as text |
034 | try |
035 | # File chooser limited to the com.adobe.pdf type and a single selection; on any error the script logs and returns the message below (NOTE(review): presumably this also covers the user pressing Cancel - confirm) |
036 | set aliasFilePath to (choose file strMes with prompt strPrompt default location (aliasDesktopDirPath) of type listUTI with invisibles and showing package contents without multiple selections allowed) as alias |
037 | on error |
038 | log "エラーしました" |
039 | return "エラーしました" |
040 | end try |
041 | ######################## |
042 | # Turn the chosen alias into a standardized POSIX path for the Cocoa calls that follow |
043 | set strPosixPath to POSIX path of aliasFilePath |
044 | set ocidStandardPath to (refMe's NSString's stringWithString:(strPosixPath))'s stringByStandardizingPath() |
045 | # Build the file NSURL from that path, flagged as not being a directory |
046 | set ocidFilePathURL to refMe's NSURL's alloc()'s initFileURLWithPath:(ocidStandardPath) isDirectory:(false) |
047 | |
048 | ######################## |
049 | #ダイアログ 出力先フォルダ |
050 | (* ファイル名から自動生成に変更 |
051 | set strName to (name of current application) as text |
052 | if strName is "osascript" then |
053 | tell application "Finder" to activate |
054 | else |
055 | tell current application to activate |
056 | end if |
057 | # デフォルトロケーション |
058 | #選択したPDFファイルと同じディレクトリ |
059 | set ocidContainerDirPathURL to ocidFilePathURL's URLByDeletingLastPathComponent() |
060 | set aliasContainerDirPath to (ocidContainerDirPathURL's absoluteURL()) as alias |
061 | # |
062 | set strMes to ("保存先フォルダを選んでください\nページ数が多い場合はフォルダ作成した方がいいです") as text |
063 | set strPrompt to ("保存先フォルダを選んでください\nページ数が多い場合はフォルダ作成した方がいいです") as text |
064 | try |
065 | set aliasSaveDirPath to (choose folder strMes with prompt strPrompt default location aliasContainerDirPath with invisibles and showing package contents without multiple selections allowed) as alias |
066 | on error |
067 | log "エラーしました" |
068 | return "エラーしました" |
069 | end try |
070 | *) |
071 | # Output folder: "<PDF file name including extension>-テキスト抽出", placed next to the selected PDF |
072 | set strSaveDirName to (ocidFilePathURL's lastPathComponent())'s mutableCopy() |
073 | (strSaveDirName's appendString:("-テキスト抽出")) |
074 | set ocidContainerDirPathURL to ocidFilePathURL's URLByDeletingLastPathComponent() |
075 | set ocidSaveDirPathURL to ocidContainerDirPathURL's URLByAppendingPathComponent:(strSaveDirName) isDirectory:(true) |
076 | # Create the folder (POSIX permissions 448 = octal 700) and stop if that fails, because every later write targets it |
077 | set appFileManager to refMe's NSFileManager's defaultManager() |
078 | set ocidAttrDict to refMe's NSMutableDictionary's alloc()'s initWithCapacity:0 |
079 | ocidAttrDict's setValue:(448) forKey:(refMe's NSFilePosixPermissions) |
080 | set listBoolMakeDir to appFileManager's createDirectoryAtURL:(ocidSaveDirPathURL) withIntermediateDirectories:true attributes:(ocidAttrDict) |error| :(reference) |
081 | if (item 1 of listBoolMakeDir) is false then |
082 | log (item 2 of listBoolMakeDir)'s localizedDescription() as text |
083 | return "エラー:フォルダの作成に失敗しました" as text |
084 | end if |
085 | ######################## |
086 | # NSData: read the selected PDF (NSDataReadingMappedIfSafe option); the call yields {data, NSError} |
087 | set ocidOption to (refMe's NSDataReadingMappedIfSafe) |
088 | set listReadData to refMe's NSData's alloc()'s initWithContentsOfURL:(ocidFilePathURL) options:(ocidOption) |error| :(reference) |
089 | set ocidReadData to (item 1 of listReadData) |
090 | if ocidReadData = (missing value) then return "エラー:PDFファイルの読み込みに失敗しました" -- no data, nothing to extract |
091 | ######################## |
092 | # PDFDocument: build the document from the data; stop when no document object comes back, since pageCount() below needs one |
093 | set ocidActiveDoc to refMe's PDFDocument's alloc()'s initWithData:(ocidReadData) |
094 | if ocidActiveDoc = (missing value) then return "エラー:PDFとして開けませんでした" |
095 | set numCntPage to ocidActiveDoc's pageCount() -- total number of pages |
096 | # Plain text accumulated over all pages |
097 | set ocidOutPutAllString to refMe's NSMutableString's alloc()'s initWithCapacity:(0) |
098 | # Attributed text accumulated over all pages (source of the combined RTF) |
099 | set ocidOutPutAttrAllString to refMe's NSMutableAttributedString's alloc()'s initWithString:("") |
100 | # Separator appended after each page in the combined RTF |
101 | set ocidLineCode to refMe's NSMutableAttributedString's alloc()'s initWithString:("\n-----------\n") |
102 | # Per-page export: for each page write <n>.txt and <n>.rtf and append the page to the combined buffers (index is 0-based, file names and messages use index + 1) |
103 | repeat with itemIntNo from 0 to (numCntPage - 1) by 1 |
104 | # Fetch the page |
105 | set ocidActivePage to (ocidActiveDoc's pageAtIndex:(itemIntNo)) |
106 | # Buffer for this page's plain text |
107 | set ocidPageOutputString to (refMe's NSMutableString's alloc()'s initWithCapacity:(0)) |
108 | ################## |
109 | # Extract the plain text |
110 | set ocidPageText to ocidActivePage's |string|() |
111 | # A page without text content gets a placeholder line instead |
112 | if ocidPageText = (missing value) then |
113 | set strSetValue to (((itemIntNo + 1) as text) & "ページ目にはテキスト情報無") as text |
114 | set ocidPageText to (refMe's NSString's stringWithString:(strSetValue)) |
115 | end if |
116 | # Append to the per-page buffer and, with heading and separator, to the all-pages text |
117 | (ocidPageOutputString's appendString:(ocidPageText)) |
118 | (ocidOutPutAllString's appendString:("■ページ : " & (itemIntNo + 1) & "\n")) |
119 | (ocidOutPutAllString's appendString:(ocidPageText)) |
120 | (ocidOutPutAllString's appendString:("\n---------------\n")) |
121 | # File name: <page number>.txt |
122 | set strSaveFileNameText to (((itemIntNo + 1) as text) & ".txt") as text |
123 | # Destination URL inside the output folder |
124 | set ocidSaveFilePathURLText to (ocidSaveDirPathURL's URLByAppendingPathComponent:(strSaveFileNameText) isDirectory:(false)) |
125 | # Write as UTF-8; the page number is coerced to text first so that log/return receive text rather than a {number, text} list |
126 | set listDone to (ocidPageOutputString's writeToURL:(ocidSaveFilePathURLText) atomically:(true) encoding:(refMe's NSUTF8StringEncoding) |error| :(reference)) |
127 | if (item 1 of listDone) is true then |
128 | log (((itemIntNo + 1) as text) & "ページ目:正常終了") |
129 | else if (item 1 of listDone) is false then |
130 | log (item 2 of listDone)'s localizedDescription() as text |
131 | return (((itemIntNo + 1) as text) & "ページ目:保存に失敗しました") |
132 | end if |
133 | ################## |
134 | # Extract the attributed text; a page without one gets the same placeholder |
135 | set ocidPageAttarText to ocidActivePage's attributedString() |
136 | if ocidPageAttarText = (missing value) then |
137 | set strSetValue to (((itemIntNo + 1) as text) & "ページ目にはテキスト情報無") as text |
138 | set ocidPageAttarText to (refMe's NSMutableAttributedString's alloc()'s initWithString:(strSetValue)) |
139 | end if |
140 | # Convert the full range of the attributed string to RTF data |
141 | set ocidLength to ocidPageAttarText's |length|() |
142 | set ocidAttarTextRange to refMe's NSMakeRange(0, ocidLength) |
143 | set ocidAttarData to (ocidPageAttarText's RTFFromRange:(ocidAttarTextRange) documentAttributes:(missing value)) |
144 | # Append heading, page content and separator to the all-pages attributed string |
145 | set ocidNewLineCode to (refMe's NSMutableAttributedString's alloc()'s initWithString:("■ページ : " & (itemIntNo + 1) & "\n")) |
146 | (ocidOutPutAttrAllString's appendAttributedString:(ocidNewLineCode)) |
147 | (ocidOutPutAttrAllString's appendAttributedString:(ocidPageAttarText)) |
148 | (ocidOutPutAttrAllString's appendAttributedString:(ocidLineCode)) |
149 | |
150 | # Save the page as <page number>.rtf (atomic write); a failure ends the script with the page number in the message |
151 | set strSaveFileNameRtf to (((itemIntNo + 1) as text) & ".rtf") as text |
152 | set ocidSaveFilePathURLRtf to (ocidSaveDirPathURL's URLByAppendingPathComponent:(strSaveFileNameRtf) isDirectory:(false)) |
153 | set ocidOption to (refMe's NSDataWritingAtomic) |
154 | set listDone to (ocidAttarData's writeToURL:(ocidSaveFilePathURLRtf) options:(ocidOption) |error| :(reference)) |
155 | if (item 1 of listDone) is true then |
156 | log (((itemIntNo + 1) as text) & "ページ目:正常終了") |
157 | else if (item 1 of listDone) is false then |
158 | log (item 2 of listDone)'s localizedDescription() as text |
159 | return (((itemIntNo + 1) as text) & "ページ目:保存に失敗しました") |
160 | end if |
161 | |
162 | end repeat |
163 | |
164 | ################# |
165 | # Combined plain-text output holding every page |
166 | # Destination: "_All.txt" inside the output folder |
167 | set strAllTextName to "_All.txt" |
168 | set ocidAllTextURL to (ocidSaveDirPathURL's URLByAppendingPathComponent:(strAllTextName) isDirectory:(false)) |
169 | # Write the accumulated text atomically as UTF-8; the call yields {success flag, NSError} |
170 | set {boolAllTextSaved, ocidAllTextError} to (ocidOutPutAllString's writeToURL:(ocidAllTextURL) atomically:(true) encoding:(refMe's NSUTF8StringEncoding) |error| :(reference)) |
171 | # On failure, log the reason and end the script with the error message |
172 | if not boolAllTextSaved then |
173 | log ocidAllTextError's localizedDescription() as text |
174 | return "エラー:保存に失敗しました" |
175 | end if |
176 | |
177 | ################# |
178 | # Combined RTF output holding every page |
179 | # Convert the full range of the accumulated attributed string to RTF data |
180 | set ocidLength to ocidOutPutAttrAllString's |length|() |
181 | set ocidAttarTextRange to refMe's NSMakeRange(0, ocidLength) |
182 | set ocidAttarData to (ocidOutPutAttrAllString's RTFFromRange:(ocidAttarTextRange) documentAttributes:(missing value)) |
183 | # Save as "_All.rtf" inside the output folder (atomic write) |
184 | set strSaveFileNameRtf to ("_All.rtf") as text |
185 | set ocidSaveFilePathURLRtf to (ocidSaveDirPathURL's URLByAppendingPathComponent:(strSaveFileNameRtf) isDirectory:(false)) |
186 | set ocidOption to (refMe's NSDataWritingAtomic) |
187 | set listDone to (ocidAttarData's writeToURL:(ocidSaveFilePathURLRtf) options:(ocidOption) |error| :(reference)) |
188 | # Failure handling mirrors the combined text file: a page-independent message is used here because |
189 | # the loop variable itemIntNo is undefined when the document has no pages and does not describe this file anyway |
190 | if (item 1 of listDone) is false then |
191 | log (item 2 of listDone)'s localizedDescription() as text |
192 | return "エラー:保存に失敗しました" as text |
193 | end if |
194 | # Show the results: open the output folder through the shared NSWorkspace, then finish |
195 | set ocidWorkspace to refMe's NSWorkspace's sharedWorkspace() |
196 | (ocidWorkspace's openURL:(ocidSaveDirPathURL)) |
197 | |
198 | return "終了" |
AppleScriptで生成しました |
| 固定リンク