Python || Pdf Split & Extract Using PyPdf
The following is a simple pdf file split & extractor program which utilizes the “pyPdf” library to manipulate pdf files. This program has the ability to extract selected pages from an existing pdf file, and save the extracted pages into a new pdf file.
REQUIRED KNOWLEDGE FOR THIS PROGRAM
PyPdf - What Is It?
How To Create Executable Python Programs
Display The Time In Python
Metadata With PyPdf
Pdf Split Executable File - Click Here To Download
This program first asks the user to place the pdf file(s) they wish to extract pages from into a specified folder. The default input folder is titled “Files To Extract.” After the input pdf file(s) have been placed into the specified input folder, the program prompts the user to select which file they wish to extract pages from. As soon as an input pdf file has been selected, the user is asked to enter the page numbers they wish to extract from the specified input pdf file. After the page extraction is completed, the selected pages are merged into a single pdf file, which is saved into an output folder titled “Completed Extracted Files.”
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 |
# =============================================================================
# Author: K Perkins
# Date: Aug 4, 2013
# Taken From: http://programmingnotes.org/
# File: PdfSplit.py
# Description: This is a simple program utilizing the pyPdf library to
#   manipulate pdf files. This program has the ability to extract selected
#   pages from an existing pdf file, and save the extracted pages into
#   a new pdf file.
# =============================================================================
import datetime
import os
import platform
import re
import sys

try:
    from pyPdf.pdf import PdfFileWriter, PdfFileReader
    from pyPdf.generic import NameObject, createStringObject
except ImportError:
    # Keep the module importable without pyPdf; main() reports the missing
    # dependency with a readable message instead of a bare traceback.
    PdfFileWriter = PdfFileReader = None
    NameObject = createStringObject = None

# ---- START GLOBAL VARIABLES ---- #
INPUT_FILE_FOLDER = "Files To Extract"
OUTPUT_FILE_FOLDER = "Completed Extracted Files"
OUTPUT_FILE_NAME = "Extracted File.pdf"
PDF_PRODUCER = "KENNETH'S PDF EXTRACTOR"

# Determine the platform
if platform.system() == "Windows":
    CURRENT_USER = os.environ.get("USERNAME")
else:
    CURRENT_USER = os.environ.get("USER")

# One page-range token: a number, optionally followed by "-<number>".
_PAGE_RANGE_PATTERN = re.compile(r"(\d+)(?:-(\d+))?")
# ---- END GLOBAL VARIABLES ---- #


def DoesFileExists(fileName, fileFolder):
    """Return True if `fileName` inside `fileFolder` can be opened for reading."""
    filePath = os.path.join(fileFolder, fileName)
    try:
        with open(filePath):
            pass
    except IOError:
        return False
    return True


def DoesFolderExist(fileFolder):
    """Return True if `fileFolder` exists; otherwise create it and return False."""
    if os.path.exists(fileFolder):
        return True
    os.makedirs(fileFolder)
    return False


def CheckOutFileDigits(outfile):
    """Return True if `outfile` ends with ")", i.e. carries a "(n)" copy suffix.

    An empty string returns False instead of raising IndexError.
    """
    return outfile.endswith(")")


def GetPageNumbers(pageRange):
    """Parse a page range (i.e: 1,2,5,56-100,241) into a list of integers.

    Each run of digits is one page; "a-b" expands to every page from a to b
    inclusive (a reversed range such as "5-1" expands to nothing). Any other
    character acts as a separator, so malformed input like "1-,5" or "1-3-5"
    yields [1, 5] and [1, 2, 3, 5] rather than dropping or fusing numbers.
    """
    pageNumbers = []
    for match in _PAGE_RANGE_PATTERN.finditer(pageRange):
        first = int(match.group(1))
        if match.group(2) is None:
            pageNumbers.append(first)
        else:
            pageNumbers.extend(range(first, int(match.group(2)) + 1))
    return pageNumbers


def DisplayFiles(files):
    """Print `files` as a table with 1-based index numbers."""
    print("Index # ||\tFile Name\n" +
          "-----------------------------------")
    for numFiles, name in enumerate(files, 1):
        print("(%d) \t ||\t%s" % (numFiles, name))


def GetFileName(index, files):
    """Return the filename at the 0-based `index` of `files`."""
    return files[index]


def Cls():
    """Clear the console screen using the platform's shell command."""
    os.system("cls" if platform.system() == "Windows" else "clear")


def GetFiles():
    """Block until the input folder contains at least one file."""
    while len(os.listdir(INPUT_FILE_FOLDER)) < 1:
        print("** NOTE: To continue, please place the file(s) that you wish to " +
              "\nextract pages from inside the \"%s\" folder located in:"
              % (INPUT_FILE_FOLDER))
        # os.path.join supplies the separator between the cwd and the folder
        print("\n%s" % (os.path.join(os.getcwd(), INPUT_FILE_FOLDER)))
        input("\nPlease press ENTER to continue...")
        Cls()  # clear the console screen


def _PromptForPdfFile():
    """Ask the user to pick a pdf from the input folder; return its filename."""
    while True:
        # MAKE SURE THERE ARE FILES IN THE INPUT FOLDER
        if len(os.listdir(INPUT_FILE_FOLDER)) < 1:
            GetFiles()

        # Snapshot the listing once so the index the user types refers to
        # exactly the table that was displayed.
        files = sorted(os.listdir(INPUT_FILE_FOLDER))

        print("\nThese are the files thats currently located in " +
              "the \"%s\" folder..\n" % (INPUT_FILE_FOLDER))
        DisplayFiles(files)

        initial = input("\nPlease enter the index number of the file that "
                        + "you wish to extract pages from:\n>> ")

        # CHECK IF THE USER ENTERED A DIGIT OR NOT
        if not initial.isdigit():
            print("\nSorry, but \"%s\" is not a positive digit..."
                  "\nPlease enter positive digits only!" % (initial))
            input("\nPlease press ENTER to continue...")
            Cls()
            continue

        # CHECK IF THE DIGIT IS WITHIN A VALID INDEX RANGE
        fileIndex = int(initial)
        if fileIndex > len(files) or fileIndex <= 0:
            print("\nSorry, but \"%d\" is not a valid index number..."
                  % (fileIndex))
            input("\nPlease press ENTER to continue...")
            Cls()  # clear the console screen
            continue

        # CHECK IF ITS A PDF FILE (extension compared case-insensitively)
        currFile = GetFileName(fileIndex - 1, files)
        if currFile.lower().endswith(".pdf"):
            return currFile

        print("\nSorry, but \"%s\" is not a pdf file!" % (currFile))
        input("\nPlease press ENTER to continue...")
        Cls()  # clear the console screen


def _NextFreeOutFileName(outfileName, fileFolder):
    """Return `outfileName`, or "<stem> (n)<ext>" with the first unused n >= 2."""
    if not DoesFileExists(outfileName, fileFolder):
        return outfileName
    stem, ext = os.path.splitext(outfileName)
    count = 2
    # Count with an integer so suffixes past (9) keep incrementing correctly.
    while DoesFileExists("%s (%d)%s" % (stem, count, ext), fileFolder):
        count += 1
    return "%s (%d)%s" % (stem, count, ext)


def main():
    """Interactively extract user-selected pages from a pdf into a new pdf."""
    if PdfFileWriter is None:
        sys.exit("** ERROR: The \"pyPdf\" library is required to run this program.")

    output_pdfFile = PdfFileWriter()

    Cls()

    # CHECK TO SEE IF INPUT/OUTPUT FOLDERS EXIST, CREATE THEM IF THEY DONT
    DoesFolderExist(INPUT_FILE_FOLDER)
    DoesFolderExist(OUTPUT_FILE_FOLDER)

    # GET A FILE NAME FROM USER TO EXTRACT PAGES FROM
    fileName = _PromptForPdfFile()

    # GET INPUT FILE DOCUMENT INFO
    filePath = os.path.join(INPUT_FILE_FOLDER, fileName)

    # The input stream stays open until the output is written, since the
    # reader may still need it when the selected pages are saved.
    with open(filePath, "rb") as inputStream:
        input_pdfFile = PdfFileReader(inputStream)
        numPagesInPDF = input_pdfFile.getNumPages()

        # DISPLAY DIRECTIONS TO USER
        print("\nCurrent file = \"%s\" and contains %d page(s)"
              % (fileName, numPagesInPDF))
        print("\nPlease enter the page numbers you wish to extract,"
              + " separated by commas")
        print("Example: 1,2,5,56-100,241")
        pageRange = input(">> ")

        # REMOVE WHITESPACES FROM THE STRING
        pageRange = pageRange.replace(" ", "")

        print("\nYou have selected to extract page(s): %s" % (pageRange))
        print("\n----------------------------------------------------------")

        # GET THE PAGE NUMBERS FROM THE USER AS SPECIFIED FROM ABOVE
        requestedPages = GetPageNumbers(pageRange)

        # KEEP ONLY THE PAGE NUMBERS THAT EXIST IN THE FILE
        pageNumbers = [x for x in requestedPages if 1 <= x <= numPagesInPDF]
        errorPage = len(pageNumbers) != len(requestedPages)

        # CHECK IF THERE ARE ANY VALID PAGES TO BE EXTRACTED FROM THE PDF FILE
        if len(pageNumbers) < 1:
            print("\n** ERROR: No pages have been selected to extract!\n"
                  + "Exiting...")
            input("\nPlease press ENTER to continue...")
            sys.exit()
        # DISPLAY ERROR IF A PAGE NUMBER DOESNT EXIST IN PDF DOCUMENT
        elif errorPage:
            print("\n** ERROR: \"%s\" only contains %d page(s).\nThe pages selected "
                  "after page #%d cannot be extracted from \"%s\""
                  % (fileName, numPagesInPDF, numPagesInPDF, fileName))
            print("\nOnly page(s)", pageNumbers, "will be extracted from the file!")
            input("\nPlease press ENTER to continue...")
            print("\n----------------------------------------------------------")

        # GET SELECTED PAGES FROM THE INPUT FILE (user pages are 1-based)
        for x in pageNumbers:
            output_pdfFile.addPage(input_pdfFile.getPage(x - 1))

        # CONSTRUCT OUTPUT FILENAME, AVOIDING EXISTING FILES
        outfileName = _NextFreeOutFileName(
            fileName[:-4] + " - " + OUTPUT_FILE_NAME, OUTPUT_FILE_FOLDER)

        # GET THE NUMBER OF PAGES IN THE OUTPUT PDF FILE
        numPages = output_pdfFile.getNumPages()

        # SET THE OUTPUT FILE METADATA
        now = datetime.datetime.now()
        timeStamp = now.strftime("CREATED: %m/%d/%Y, %I:%M:%S %p")
        infoDict = output_pdfFile._info.getObject()
        infoDict.update({
            NameObject('/Title'): createStringObject(outfileName),
            # the user environment variable may be unset
            NameObject('/Author'): createStringObject(CURRENT_USER or "Unknown"),
            NameObject('/Subject'): createStringObject(timeStamp),
            NameObject('/Creator'): createStringObject(PDF_PRODUCER)
        })

        # SAVE OUTPUT FILE TO THE OUTPUT FOLDER
        filePath = os.path.join(OUTPUT_FILE_FOLDER, outfileName)
        with open(filePath, "wb") as outputStream:
            output_pdfFile.write(outputStream)

    # DISPLAY FINAL MESSAGE TO USER
    print("\n\"%s\" has been created and contains %d total page(s)"
          % (outfileName, numPages))
    print("\nThis file is located in the following directory:\n"
          + "\n%s" % (os.path.join(os.getcwd(), OUTPUT_FILE_FOLDER)))
    input("\nPlease press ENTER to continue...")


if __name__ == "__main__":
    main()
# http://programmingnotes.org/
QUICK NOTES:
The highlighted lines are sections of interest to look out for.
The code is heavily commented, so no further insight is necessary. If you have any questions, feel free to leave a comment below.
Click here to download a Windows executable file demonstrating the above use.
Leave a Reply