Python || Pdf Split & Extract Using PyPdf

August 04, 2013 admin No comments
The following is a simple pdf file split & extractor program which utilizes the “pyPdf” library to manipulate pdf files. This program has the ability to extract selected pages from an existing pdf file, and save the extracted pages into a new pdf file.
REQUIRED KNOWLEDGE FOR THIS PROGRAM
PyPdf - What Is It? How To Create Executable Python Programs Display The Time In Python Metadata With PyPdf Pdf Split Executable File - Click Here To Download
This program first asks the user to place the pdf file(s) they wish to extract pages from into a specified folder. The default input folder is titled “Files To Extract.” After the input pdf file(s) have been placed into the specified input folder, the program prompts the user to select which file they wish to extract pages from. As soon as an input pdf file has been selected, the user is asked to enter the page numbers they wish to extract from that file. After the page extraction is completed, the selected pages are merged into a single pdf file, which is saved into an output folder titled “Completed Extracted Files.”




		
		
			Pdf Split & Extract Using PyPdf
			
Python
			
			# =============================================================================
#   Author: K Perkins
#   Date:  Aug 4, 2013
#   Taken From: http://programmingnotes.org/
#   File: PdfSplit.py
#   Description: This is a simple program utilizing the pyPdf library to
#      manipulate pdf files. This program has the ability to extract selected
#      pages from an existing pdf file, and save the extracted pages into
#      a new pdf file.
# =============================================================================
import sys, os, datetime, platform
from pyPdf.pdf import PdfFileWriter, PdfFileReader
from pyPdf.generic import NameObject, createStringObject

# ---- START GLOBAL VARIABLES ---- #

# Folder scanned for input pdf files, folder that receives the results,
# suffix appended to every output file name, and the pdf "/Creator" tag.
INPUT_FILE_FOLDER = "Files To Extract"
OUTPUT_FILE_FOLDER = "Completed Extracted Files"
OUTPUT_FILE_NAME = "Extracted File.pdf"
PDF_PRODUCER = "KENNETH'S PDF EXTRACTOR"

# Determine the platform: the login name is read from USERNAME on Windows
# and from USER everywhere else.
if(platform.system() == "Windows"):
    CURRENT_USER = os.environ.get("USERNAME")
else:
    CURRENT_USER = os.environ.get("USER")

# os.environ.get() returns None when the variable is unset; fall back to a
# placeholder so CURRENT_USER is always a non-empty string.
if not CURRENT_USER:
    CURRENT_USER = "Unknown"

# ---- END GLOBAL VARIABLES ---- #

def DoesFileExists(fileName, fileFolder):
    """Report whether ``fileName`` is already taken inside ``fileFolder``.

    Args:
        fileName: name of the entry to look for.
        fileFolder: folder the entry is expected to live in.

    Returns:
        True if a path with that name exists, False otherwise.
    """
    # Ask the file system directly instead of probing with open(): open()
    # fails on directories and unreadable files, which made an occupied
    # name look free to the caller.
    return os.path.exists(os.path.join(fileFolder, fileName))

def DoesFolderExist(fileFolder):
    """Make sure ``fileFolder`` exists, creating it when it is missing.

    Returns:
        True if the path was already present, False if it had to be created.
    """
    alreadyPresent = os.path.exists(fileFolder)
    if alreadyPresent:
        return True
    # Missing: build the folder (and any missing parents) for the caller.
    os.makedirs(fileFolder)
    return False

def CheckOutFileDigits(outfile):
    """Report whether ``outfile`` ends with a closing parenthesis.

    The caller uses this on a name whose ".pdf" suffix was already removed
    to detect a trailing " (N)" copy counter.

    Args:
        outfile: file name to inspect; may be empty.

    Returns:
        True if the last character is ")", False otherwise (including for
        an empty string, which previously raised IndexError).
    """
    return outfile.endswith(")")

def GetPageNumbers(pageRange):
    """Parse a page selection (i.e: 1,2,5,56-100,241) into a list of ints.

    A run of digits is a single page; two runs joined by one or more dashes
    ("56-100") form an inclusive range. Every other character acts as a
    separator. Pages are returned in the order written, duplicates included.
    A descending range such as "5-3" contributes no pages.

    Malformed pieces are tolerated instead of crashing or silently dropping
    later pages: a dangling dash ("1-,5") yields the pages on both sides,
    and a chained range ("1-3-5") is read as "1-3" followed by "5".

    Args:
        pageRange: the text typed by the user.

    Returns:
        List of page numbers as integers (empty when no digits are found).
    """
    import re  # local import so this block does not depend on file-level edits

    pageNumbers = []
    # Group 1 is the first page; group 2 is the optional end of a range and
    # comes back as "" when the token is a single page.
    for first, last in re.findall(r"(\d+)(?:-+(\d+))?", pageRange):
        start = int(first)
        end = int(last) if last else start
        pageNumbers.extend(range(start, end + 1))
    return pageNumbers

def DisplayFiles(files):
    """Print ``files`` as a numbered two-column table on standard output.

    The index column starts at 1, matching the number the user is later
    asked to type in.

    Args:
        files: iterable of file names to list.
    """
    # The tab/newline escapes below had lost their backslashes, so the
    # header and rows printed literal "t" and "n" characters.
    print("Index #   ||\tFile Name\n"+
       "-----------------------------------")
    for numFiles, x in enumerate(files, 1):
        print("(%d) \t  ||\t%s" %(numFiles, x))

def GetFileName(index, files):
    """Look up the entry of ``files`` stored at position ``index``.

    Args:
        index: zero-based position inside ``files``.
        files: sequence of file names.

    Returns:
        The element found at that position.
    """
    selected = files[index]
    return selected

def Cls():
    """Clear the console by running the platform's clear-screen command."""
    # "cls" on Windows, "clear" on every other platform
    if platform.system() == "Windows":
        command = "cls"
    else:
        command = "clear"
    os.system(command)

def GetFiles():
    """Block until INPUT_FILE_FOLDER contains at least one entry.

    While the folder is empty, tell the user where to put their files,
    wait for ENTER, clear the screen and check again.
    """
    while not os.listdir(INPUT_FILE_FOLDER):
        # The quote/newline escapes in these messages had lost their
        # backslashes, which left the first literal unparsable.
        print("** NOTE: To continue, please place the file(s) that you wish to "+
            '\nextract pages from inside the "%s" folder located in:'
            %(INPUT_FILE_FOLDER))
        # Join with the platform separator; plain concatenation glued the
        # folder name directly onto the working directory.
        print("\n%s" %(os.path.join(os.getcwd(), INPUT_FILE_FOLDER)))
        input("\nPlease press ENTER to continue...")
        Cls() # clear the console screen

def _ChooseInputFile():
    """Prompt until the user picks a pdf in INPUT_FILE_FOLDER; return its name."""
    while True:
        # MAKE SURE THERE ARE FILES IN THE INPUT FOLDER
        if not os.listdir(INPUT_FILE_FOLDER):
            GetFiles()

        # Take one sorted snapshot so the index shown to the user is
        # guaranteed to be the index that gets looked up.
        files = sorted(os.listdir(INPUT_FILE_FOLDER))

        print("\nThese are the files thats currently located in "+
            'the "%s" folder..\n' %(INPUT_FILE_FOLDER))

        # DISPLAY THE FILES THATS IN THE INPUT FOLDER TO THE SCREEN
        DisplayFiles(files)

        # GET THE FILE INDEX NUMBER FROM THE USER
        initial = input("\nPlease enter the index number of the file that "
            +"you wish to extract pages from:\n>> ")

        # CHECK IF THE USER ENTERED A DIGIT OR NOT
        if not initial.isdigit():
            print('\nSorry, but "%s" is not a positive digit...'
              "\nPlease enter positive digits only!" %(initial))
            input("\nPlease press ENTER to continue...")
            Cls()
            continue

        # CHECK IF THE DIGIT IS WITHIN A VALID INDEX RANGE
        fileIndex = int(initial)
        if((fileIndex > len(files)) or (fileIndex <= 0)):
            print('\nSorry, but "%d" is not a valid index number...'
              %(fileIndex))
            input("\nPlease press ENTER to continue...")
            Cls() # clear the console screen
            continue

        # GET THE FILENAME FROM THE FOLDER AND CHECK IF ITS A PDF FILE
        currFile = GetFileName(fileIndex-1, files)
        if(currFile.lower().endswith(".pdf")):
            return currFile

        print('\nSorry, but "%s" is not a pdf file!' %(currFile))
        input("\nPlease press ENTER to continue...")
        Cls() # clear the console screen


def _UniqueOutputName(outfileName):
    """Return ``outfileName`` or its first " (N)" variant unused in OUTPUT_FILE_FOLDER."""
    stem, extension = os.path.splitext(outfileName)
    candidate = outfileName
    copyNumber = 2
    # Count with an integer rather than re-reading the last character of
    # the name, which broke once the counter needed two digits.
    while(DoesFileExists(candidate, OUTPUT_FILE_FOLDER)):
        candidate = "%s (%d)%s" %(stem, copyNumber, extension)
        copyNumber += 1
    return candidate


def main():
    """Run the interactive pdf page extractor.

    Steps: make sure the input/output folders exist, let the user choose a
    pdf from the input folder, read the page selection, drop page numbers
    that are outside the document, copy the remaining pages into a new pdf
    inside OUTPUT_FILE_FOLDER and report where it was written.

    Exits through sys.exit() when no valid page remains to extract.
    """
    output_pdfFile = PdfFileWriter()

    Cls()
    # CHECK TO SEE IF INPUT/OUTPUT FOLDERS EXIST, CREATE THEM IF THEY DONT
    DoesFolderExist(INPUT_FILE_FOLDER)
    DoesFolderExist(OUTPUT_FILE_FOLDER)

    # GET A FILE NAME FROM USER TO EXTRACT PAGES FROM
    fileName = _ChooseInputFile()

    # The input stream stays open until the output has been written, because
    # the pages added to the writer come from this reader; the with-block
    # then closes it on every exit path (it used to be left open).
    filePath = os.path.join(INPUT_FILE_FOLDER, fileName)
    with open(filePath, "rb") as inputStream:
        # GET INPUT FILE DOCUMENT INFO
        input_pdfFile = PdfFileReader(inputStream)
        numPagesInPDF = input_pdfFile.getNumPages()

        # DISPLAY DIRECTIONS TO USER
        print('\nCurrent file = "%s" and contains %d page(s)'
          %(fileName, numPagesInPDF))
        print("\nPlease enter the page numbers you wish to extract,"
          +" separated by commas")
        print("Example: 1,2,5,56-100,241")
        pageRange = input(">> ")

        # REMOVE WHITESPACES FROM THE STRING
        pageRange = pageRange.replace(" ", "")

        print("\nYou have selected to extract page(s): %s" %(pageRange))

        print("\n----------------------------------------------------------")

        # GET THE TOTAL NUMBER OF PAGES FROM THE USER AS SPECIFIED FROM ABOVE
        requestedPages = GetPageNumbers(pageRange)

        # KEEP ONLY THE PAGE NUMBERS WHICH EXIST IN THE FILE
        pageNumbers = [x for x in requestedPages if 1 <= x <= numPagesInPDF]
        errorPage = len(pageNumbers) != len(requestedPages)

        # CHECK IF THERE ARE ANY VALID PAGES TO BE EXTRACTED FROM THE PDF FILE
        if(len(pageNumbers) < 1):
            print("\n** ERROR: No pages have been selected to extract!\n"
             +"Exiting...")
            input("\nPlease press ENTER to continue...")
            sys.exit()

        # DISPLAY ERROR IF A PAGE NUMBER DOESNT EXIST IN PDF DOCUMENT
        elif(errorPage):
            print('\n** ERROR: "%s" only contains %d page(s).\nThe pages selected '
                'after page #%d cannot be extracted from "%s"'
                %(fileName,numPagesInPDF,numPagesInPDF,fileName))
            print("\nOnly page(s)",pageNumbers,"will be extracted from the file!")
            input("\nPlease press ENTER to continue...")
            print("\n----------------------------------------------------------")

        # START EXTRACTING PAGE NUMBERS
        # GET SELECTED PAGES FROM THE INPUT FILE (getPage is zero-based)
        for x in pageNumbers:
            output_pdfFile.addPage(input_pdfFile.getPage(x-1))

        # CONSTRUCT OUTPUT FILENAME
        outfileName = _UniqueOutputName(
            os.path.splitext(fileName)[0] +" - "+ OUTPUT_FILE_NAME)

        # GET THE NUMBER OF PAGES IN THE OUTPUT PDF FILE
        numPages = output_pdfFile.getNumPages()

        # SAVE OUTPUT FILE TO THE OUTPUT FOLDER
        now = datetime.datetime.now()
        created = str(now.strftime("CREATED: %m/%d/%Y, %I:%M:%S %p"))
        infoDict = output_pdfFile._info.getObject()

        infoDict.update({
           NameObject('/Title'): createStringObject(outfileName),
           # CURRENT_USER is None when the login variable is unset
           NameObject('/Author'): createStringObject(CURRENT_USER or "Unknown"),
           NameObject('/Subject'): createStringObject(created),
           NameObject('/Creator'): createStringObject(PDF_PRODUCER)
        })

        filePath = os.path.join(OUTPUT_FILE_FOLDER, outfileName)
        with open(filePath, "wb") as outputStream:
            output_pdfFile.write(outputStream)

    # DISPLAY FINAL MESSAGE TO USER
    print('\n"%s" has been created and contains %d total page(s)'
      %(outfileName, numPages))
    print("\nThis file is located in the following directory:\n"
      +"\n%s" %(os.path.join(os.getcwd(), OUTPUT_FILE_FOLDER)))
    input("\nPlease press ENTER to continue...")

# Start the interactive extractor only when this file is run as a script;
# importing it as a module does not call main().
if __name__ == "__main__":
    main()
# http://programmingnotes.org/
			
				
					
				
					1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
				
						# =============================================================================
#   Author: K Perkins
#   Date:  Aug 4, 2013
#   Taken From: http://programmingnotes.org/
#   File: PdfSplit.py
#   Description: This is a simple program utilizing the pyPdf library to
#      manipulate pdf files. This program has the ability to extract selected
#      pages from an existing pdf file, and save the extracted pages into
#      a new pdf file.
# =============================================================================
import sys, os, datetime, platform
from pyPdf.pdf import PdfFileWriter, PdfFileReader
from pyPdf.generic import NameObject, createStringObject
 
# ---- START GLOBAL VARIABLES ---- #
 
# Folder scanned for input pdf files, folder that receives the results,
# suffix appended to every output file name, and the pdf "/Creator" tag.
INPUT_FILE_FOLDER = "Files To Extract"
OUTPUT_FILE_FOLDER = "Completed Extracted Files"
OUTPUT_FILE_NAME = "Extracted File.pdf"
PDF_PRODUCER = "KENNETH'S PDF EXTRACTOR"

# Determine the platform: the login name is read from USERNAME on Windows
# and from USER everywhere else.
if(platform.system() == "Windows"):
    CURRENT_USER = os.environ.get("USERNAME")
else:
    CURRENT_USER = os.environ.get("USER")

# os.environ.get() returns None when the variable is unset; fall back to a
# placeholder so CURRENT_USER is always a non-empty string.
if not CURRENT_USER:
    CURRENT_USER = "Unknown"
 
# ---- END GLOBAL VARIABLES ---- #
 
def DoesFileExists(fileName, fileFolder):
    """Report whether ``fileName`` is already taken inside ``fileFolder``.

    Args:
        fileName: name of the entry to look for.
        fileFolder: folder the entry is expected to live in.

    Returns:
        True if a path with that name exists, False otherwise.
    """
    # Ask the file system directly instead of probing with open(): open()
    # fails on directories and unreadable files, which made an occupied
    # name look free to the caller.
    return os.path.exists(os.path.join(fileFolder, fileName))
 
def DoesFolderExist(fileFolder):
    """Make sure ``fileFolder`` exists, creating it when it is missing.

    Returns:
        True if the path was already present, False if it had to be created.
    """
    alreadyPresent = os.path.exists(fileFolder)
    if alreadyPresent:
        return True
    # Missing: build the folder (and any missing parents) for the caller.
    os.makedirs(fileFolder)
    return False
 
def CheckOutFileDigits(outfile):
    """Report whether ``outfile`` ends with a closing parenthesis.

    The caller uses this on a name whose ".pdf" suffix was already removed
    to detect a trailing " (N)" copy counter.

    Args:
        outfile: file name to inspect; may be empty.

    Returns:
        True if the last character is ")", False otherwise (including for
        an empty string, which previously raised IndexError).
    """
    return outfile.endswith(")")
 
def GetPageNumbers(pageRange):
    """Parse a page selection (i.e: 1,2,5,56-100,241) into a list of ints.

    A run of digits is a single page; two runs joined by one or more dashes
    ("56-100") form an inclusive range. Every other character acts as a
    separator. Pages are returned in the order written, duplicates included.
    A descending range such as "5-3" contributes no pages.

    Malformed pieces are tolerated instead of crashing or silently dropping
    later pages: a dangling dash ("1-,5") yields the pages on both sides,
    and a chained range ("1-3-5") is read as "1-3" followed by "5".

    Args:
        pageRange: the text typed by the user.

    Returns:
        List of page numbers as integers (empty when no digits are found).
    """
    import re  # local import so this block does not depend on file-level edits

    pageNumbers = []
    # Group 1 is the first page; group 2 is the optional end of a range and
    # comes back as "" when the token is a single page.
    for first, last in re.findall(r"(\d+)(?:-+(\d+))?", pageRange):
        start = int(first)
        end = int(last) if last else start
        pageNumbers.extend(range(start, end + 1))
    return pageNumbers
 
def DisplayFiles(files):
    """Print ``files`` as a numbered two-column table on standard output.

    The index column starts at 1, matching the number the user is later
    asked to type in.

    Args:
        files: iterable of file names to list.
    """
    # The tab/newline escapes below had lost their backslashes, so the
    # header and rows printed literal "t" and "n" characters.
    print("Index #   ||\tFile Name\n"+
       "-----------------------------------")
    for numFiles, x in enumerate(files, 1):
        print("(%d) \t  ||\t%s" %(numFiles, x))
 
def GetFileName(index, files):
    """Look up the entry of ``files`` stored at position ``index``.

    Args:
        index: zero-based position inside ``files``.
        files: sequence of file names.

    Returns:
        The element found at that position.
    """
    selected = files[index]
    return selected
 
def Cls():
    """Clear the console by running the platform's clear-screen command."""
    # "cls" on Windows, "clear" on every other platform
    if platform.system() == "Windows":
        command = "cls"
    else:
        command = "clear"
    os.system(command)
 
def GetFiles():
    """Block until INPUT_FILE_FOLDER contains at least one entry.

    While the folder is empty, tell the user where to put their files,
    wait for ENTER, clear the screen and check again.
    """
    while not os.listdir(INPUT_FILE_FOLDER):
        # The quote/newline escapes in these messages had lost their
        # backslashes, which left the first literal unparsable.
        print("** NOTE: To continue, please place the file(s) that you wish to "+
            '\nextract pages from inside the "%s" folder located in:'
            %(INPUT_FILE_FOLDER))
        # Join with the platform separator; plain concatenation glued the
        # folder name directly onto the working directory.
        print("\n%s" %(os.path.join(os.getcwd(), INPUT_FILE_FOLDER)))
        input("\nPlease press ENTER to continue...")
        Cls() # clear the console screen
 
def _ChooseInputFile():
    """Prompt until the user picks a pdf in INPUT_FILE_FOLDER; return its name."""
    while True:
        # MAKE SURE THERE ARE FILES IN THE INPUT FOLDER
        if not os.listdir(INPUT_FILE_FOLDER):
            GetFiles()

        # Take one sorted snapshot so the index shown to the user is
        # guaranteed to be the index that gets looked up.
        files = sorted(os.listdir(INPUT_FILE_FOLDER))

        print("\nThese are the files thats currently located in "+
            'the "%s" folder..\n' %(INPUT_FILE_FOLDER))

        # DISPLAY THE FILES THATS IN THE INPUT FOLDER TO THE SCREEN
        DisplayFiles(files)

        # GET THE FILE INDEX NUMBER FROM THE USER
        initial = input("\nPlease enter the index number of the file that "
            +"you wish to extract pages from:\n>> ")

        # CHECK IF THE USER ENTERED A DIGIT OR NOT
        if not initial.isdigit():
            print('\nSorry, but "%s" is not a positive digit...'
              "\nPlease enter positive digits only!" %(initial))
            input("\nPlease press ENTER to continue...")
            Cls()
            continue

        # CHECK IF THE DIGIT IS WITHIN A VALID INDEX RANGE
        fileIndex = int(initial)
        if((fileIndex > len(files)) or (fileIndex <= 0)):
            print('\nSorry, but "%d" is not a valid index number...'
              %(fileIndex))
            input("\nPlease press ENTER to continue...")
            Cls() # clear the console screen
            continue

        # GET THE FILENAME FROM THE FOLDER AND CHECK IF ITS A PDF FILE
        currFile = GetFileName(fileIndex-1, files)
        if(currFile.lower().endswith(".pdf")):
            return currFile

        print('\nSorry, but "%s" is not a pdf file!' %(currFile))
        input("\nPlease press ENTER to continue...")
        Cls() # clear the console screen


def _UniqueOutputName(outfileName):
    """Return ``outfileName`` or its first " (N)" variant unused in OUTPUT_FILE_FOLDER."""
    stem, extension = os.path.splitext(outfileName)
    candidate = outfileName
    copyNumber = 2
    # Count with an integer rather than re-reading the last character of
    # the name, which broke once the counter needed two digits.
    while(DoesFileExists(candidate, OUTPUT_FILE_FOLDER)):
        candidate = "%s (%d)%s" %(stem, copyNumber, extension)
        copyNumber += 1
    return candidate


def main():
    """Run the interactive pdf page extractor.

    Steps: make sure the input/output folders exist, let the user choose a
    pdf from the input folder, read the page selection, drop page numbers
    that are outside the document, copy the remaining pages into a new pdf
    inside OUTPUT_FILE_FOLDER and report where it was written.

    Exits through sys.exit() when no valid page remains to extract.
    """
    output_pdfFile = PdfFileWriter()

    Cls()
    # CHECK TO SEE IF INPUT/OUTPUT FOLDERS EXIST, CREATE THEM IF THEY DONT
    DoesFolderExist(INPUT_FILE_FOLDER)
    DoesFolderExist(OUTPUT_FILE_FOLDER)

    # GET A FILE NAME FROM USER TO EXTRACT PAGES FROM
    fileName = _ChooseInputFile()

    # The input stream stays open until the output has been written, because
    # the pages added to the writer come from this reader; the with-block
    # then closes it on every exit path (it used to be left open).
    filePath = os.path.join(INPUT_FILE_FOLDER, fileName)
    with open(filePath, "rb") as inputStream:
        # GET INPUT FILE DOCUMENT INFO
        input_pdfFile = PdfFileReader(inputStream)
        numPagesInPDF = input_pdfFile.getNumPages()

        # DISPLAY DIRECTIONS TO USER
        print('\nCurrent file = "%s" and contains %d page(s)'
          %(fileName, numPagesInPDF))
        print("\nPlease enter the page numbers you wish to extract,"
          +" separated by commas")
        print("Example: 1,2,5,56-100,241")
        pageRange = input(">> ")

        # REMOVE WHITESPACES FROM THE STRING
        pageRange = pageRange.replace(" ", "")

        print("\nYou have selected to extract page(s): %s" %(pageRange))

        print("\n----------------------------------------------------------")

        # GET THE TOTAL NUMBER OF PAGES FROM THE USER AS SPECIFIED FROM ABOVE
        requestedPages = GetPageNumbers(pageRange)

        # KEEP ONLY THE PAGE NUMBERS WHICH EXIST IN THE FILE
        pageNumbers = [x for x in requestedPages if 1 <= x <= numPagesInPDF]
        errorPage = len(pageNumbers) != len(requestedPages)

        # CHECK IF THERE ARE ANY VALID PAGES TO BE EXTRACTED FROM THE PDF FILE
        if(len(pageNumbers) < 1):
            print("\n** ERROR: No pages have been selected to extract!\n"
             +"Exiting...")
            input("\nPlease press ENTER to continue...")
            sys.exit()

        # DISPLAY ERROR IF A PAGE NUMBER DOESNT EXIST IN PDF DOCUMENT
        elif(errorPage):
            print('\n** ERROR: "%s" only contains %d page(s).\nThe pages selected '
                'after page #%d cannot be extracted from "%s"'
                %(fileName,numPagesInPDF,numPagesInPDF,fileName))
            print("\nOnly page(s)",pageNumbers,"will be extracted from the file!")
            input("\nPlease press ENTER to continue...")
            print("\n----------------------------------------------------------")

        # START EXTRACTING PAGE NUMBERS
        # GET SELECTED PAGES FROM THE INPUT FILE (getPage is zero-based)
        for x in pageNumbers:
            output_pdfFile.addPage(input_pdfFile.getPage(x-1))

        # CONSTRUCT OUTPUT FILENAME
        outfileName = _UniqueOutputName(
            os.path.splitext(fileName)[0] +" - "+ OUTPUT_FILE_NAME)

        # GET THE NUMBER OF PAGES IN THE OUTPUT PDF FILE
        numPages = output_pdfFile.getNumPages()

        # SAVE OUTPUT FILE TO THE OUTPUT FOLDER
        now = datetime.datetime.now()
        created = str(now.strftime("CREATED: %m/%d/%Y, %I:%M:%S %p"))
        infoDict = output_pdfFile._info.getObject()

        infoDict.update({
           NameObject('/Title'): createStringObject(outfileName),
           # CURRENT_USER is None when the login variable is unset
           NameObject('/Author'): createStringObject(CURRENT_USER or "Unknown"),
           NameObject('/Subject'): createStringObject(created),
           NameObject('/Creator'): createStringObject(PDF_PRODUCER)
        })

        filePath = os.path.join(OUTPUT_FILE_FOLDER, outfileName)
        with open(filePath, "wb") as outputStream:
            output_pdfFile.write(outputStream)

    # DISPLAY FINAL MESSAGE TO USER
    print('\n"%s" has been created and contains %d total page(s)'
      %(outfileName, numPages))
    print("\nThis file is located in the following directory:\n"
      +"\n%s" %(os.path.join(os.getcwd(), OUTPUT_FILE_FOLDER)))
    input("\nPlease press ENTER to continue...")
 
# Start the interactive extractor only when this file is run as a script;
# importing it as a module does not call main().
if __name__ == "__main__":
    main()
# http://programmingnotes.org/
QUICK NOTES:
The highlighted lines are sections of interest to look out for.
The code is heavily commented, so no further insight is necessary. If you have any questions, feel free to leave a comment below.
Click here to download a Windows executable file demonstrating the above use.
Was this article helpful?
👍 YesNo
Posted in: Console, Python Tags: pdf split, pypdf, python
Python || Pdf Split & Extract Using PyPdf

Related

Leave a ReplyCancel reply

Search

Compile Your Code Now

Translate Page

Popular Pages

Recent

Archives

Categories