Make a Python script that takes all PDF files in a folder and merges them into one file without using PyPDF2

Here's a Python script that uses pdfminer.six (not PyPDF2) to read all PDF files in a folder and write their extracted text into a single output file:

main.py
import os
from typing import List
from io import BytesIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


def get_files_in_directory(directory: str, extension: str = ".pdf") -> List[str]:
    """
    Return the paths of the regular files in a directory that end with an extension.

    Args:
        directory: Folder to scan; only its immediate entries are examined.
        extension: Suffix a filename must end with (compared case-sensitively).

    Returns:
        The matching entries joined onto ``directory``, ordered by filename so
        the result is the same on every run.

    Raises:
        OSError: If ``directory`` cannot be listed (propagated from
            ``os.listdir``).
    """
    # os.listdir gives no ordering guarantee; sort so that callers which
    # concatenate the files produce a reproducible result.
    candidates = (
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.endswith(extension)
    )
    # A sub-directory can also be named "something.pdf"; keep only real files
    # so callers can open every returned path.
    return [path for path in candidates if os.path.isfile(path)]


def merge_pdfs(directory: str, output_file: str) -> None:
    """
    Extract the text of every PDF in a directory and write it to one file.

    Each input file is parsed with pdfminer and every page is rendered through
    a ``TextConverter`` that writes into ``output_file``. The result is the
    concatenated extracted text of the inputs, not a PDF document: images,
    fonts and page layout are not carried over.

    Args:
        directory: Folder whose ``.pdf`` files are read, in sorted path order.
        output_file: Path of the file to create or overwrite. It may be inside
            ``directory``; it is never treated as an input.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
    """
    # Collect the inputs BEFORE creating the output, and drop the output path
    # itself. Otherwise an output located in the input folder would be picked
    # up as a (freshly truncated, empty) PDF and fail to parse.
    target = os.path.abspath(output_file)
    sources = sorted(
        path
        for path in get_files_in_directory(directory)
        if os.path.abspath(path) != target
    )

    with open(output_file, "wb") as f:
        for source in sources:
            with open(source, "rb") as pdf_file:
                # The parser reads from the open file directly; no need to
                # copy the whole document into memory first.
                parser = PDFParser(pdf_file)
                doc = PDFDocument(parser)
                # Resource manager and converter are created per input file,
                # so no parsed state is carried from one document to the next.
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr, f, laparams=LAParams())
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.create_pages(doc):
                        interpreter.process_page(page)
                finally:
                    # Close the converter even if a page fails to process.
                    device.close()


# Script entry point: run the merge on the example (placeholder) paths below.
if __name__ == "__main__":
    # Replace both placeholders with real locations before running.
    directory, output_file = "path/to/pdf/folder", "path/to/output_file.pdf"
    merge_pdfs(directory=directory, output_file=output_file)
1586 chars
46 lines

The merge_pdfs() function takes the path to the directory containing the PDF files, and the path to the output file. It opens each PDF file in the directory one by one, extracts the text content from each page and writes it into the output file.

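Note: This approach only merges the text content of the PDF files, not the raw PDF contents, so images, fonts, page layout, and other complex PDF elements are not preserved. The output file therefore contains plain extracted text and is not a valid PDF document, even if it is given a .pdf extension.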

gistlib by LogSnag