Make a script that takes all PDF files in a folder and merges them into one file in Python, without using PyPDF2

Here's a Python script that takes all PDF files in a folder and merges them into a single file without using PyPDF2:


main.py
import os
from typing import List
from io import BytesIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


def get_files_in_directory(directory: str, extension: str = ".pdf") -> List[str]:
    """
    Return the paths of the regular files in a directory that end with an extension.

    Args:
        directory: Folder to scan; sub-folders are not searched.
        extension: Suffix a file name must end with (case-sensitive match).

    Returns:
        Paths built as ``os.path.join(directory, name)``, ordered by file name
        so the result is the same on every run.

    Raises:
        OSError: If ``directory`` cannot be listed (propagated from os.listdir).
    """
    # os.listdir returns entries in arbitrary order; sort them so callers that
    # concatenate the files get a reproducible result.
    candidates = (
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.endswith(extension)
    )
    # Skip sub-directories that merely happen to be named like "x.pdf";
    # callers open every returned path as a file.
    return [path for path in candidates if os.path.isfile(path)]


def merge_pdfs(directory: str, output_file: str) -> None:
    """
    Concatenate the text of every PDF file in a directory into one output file.

    Each input is run through pdfminer's ``TextConverter``, so what is written
    is the text extracted from the pages, not PDF data: images and page layout
    are not carried over, and the result is not a valid PDF document even when
    ``output_file`` ends in ".pdf".

    Args:
        directory: Folder whose ".pdf" files are read.
        output_file: Path of the file to write; it is overwritten if it exists.
    """
    # Collect the inputs BEFORE the output file is created, and leave the
    # output path itself out: when it lives inside `directory` it would
    # otherwise be listed as an (empty) input and fail to parse.
    output_path = os.path.realpath(output_file)
    input_files = sorted(
        path
        for path in get_files_in_directory(directory)
        if os.path.realpath(path) != output_path
    )

    with open(output_file, "wb") as f:
        for file in input_files:
            # The whole file is buffered in memory, so the handle can be
            # released before parsing starts.
            with open(file, "rb") as pdf_file:
                pdf_data = BytesIO(pdf_file.read())
            parser = PDFParser(pdf_data)
            doc = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, f, laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
            finally:
                # Close the converter even if a page fails to process.
                device.close()


# Example usage: both paths are placeholders to be replaced with real ones.
if __name__ == "__main__":
    merge_pdfs(
        directory="path/to/pdf/folder",
        output_file="path/to/output_file.pdf",
    )
1586 chars
46 lines

The merge_pdfs() function takes the path to the directory containing the PDF files, and the path to the output file. It opens each PDF file in the directory one by one, extracts the text content from each page and writes it into the output file.

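Note: This approach only extracts and concatenates the text content of the PDF files; it does not merge the PDF pages themselves. Images, fonts, and page layout are therefore lost, and the output file contains plain text rather than a valid PDF, even if it is given a .pdf name.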

similar python code snippets

make a script that takes all pdf files in a folder and merges them to one file in python

connect two dicts in python

merge two series in a dataframe with special function for each column in python

merge two series in a dataframe in python

merge a dataframe row values from another row values in python

merge a dictionary with another dictionary in python

merge two dictionaries with the same structure in python

merge two dataframe by a column in python

merge with the help of left join, and with the remaining dataframes - through inner. as a result of connecting dataframes in python

merge two dictionaries which have lists as values in python

related categories