직장인

[회사원] 업무 코딩기록 - (Fitz, Pandas) pdf에서 표 추출(.csv)

cyy1211 2024. 12. 20. 15:25
728x90
반응형
# import package PyMuPDF and pandas
import fitz  
import pandas as pd
import os

# Script: extract the first table found on the first page of "test.pdf"
# and save it as "extracted_table.csv".

# Folder holding the source PDF and folder that receives the extracted CSV.
input_folder = r"C:\Users\user\Desktop\VSCode(python)\(CODE)PDF,OCR\pdf_raw"
output_folder = r"C:\Users\user\Desktop\VSCode(python)\(CODE)PDF,OCR\pdf_result"

# Create the output directory if it doesn't exist yet.
os.makedirs(output_folder, exist_ok=True)

# Open the PDF as a context manager so the document is closed even when
# table detection, the DataFrame conversion, or the CSV write raises.
with fitz.open(os.path.join(input_folder, "test.pdf")) as doc:
    # Pages are addressed by 0-based index, so this is the first page.
    page = doc[0]

    # Look for tables on this page and report how many were found.
    tabs = page.find_tables()
    print(f"{len(tabs.tables)} table(s) on {page}")

    if tabs.tables:
        # Only the first detected table is exported.
        tab = tabs[0]

        # Convert the table to a pandas DataFrame and show it.
        df = tab.to_pandas()
        print("Table:")
        print(df)

        # Write the DataFrame to CSV without the index column.
        output_file = os.path.join(output_folder, "extracted_table.csv")
        df.to_csv(output_file, index=False)
        print(f"Table saved to {output_file}")
    else:
        print("No tables found on the page.")
728x90
반응형