import os
import pandas as pd
import numpy as np
import PyPDF2
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
# Extract text from the PDF files in a folder
def extract_text_from_pdfs(pdf_folder):
    """Return the extracted text of every ``.pdf`` file in *pdf_folder*.

    Files are visited in sorted name order so the result is reproducible
    (``os.listdir`` gives no ordering guarantee). The page texts of one file
    are joined with a single space.

    Args:
        pdf_folder: Path of the directory to scan (not recursive). Only names
            ending in ``.pdf`` (case-sensitive) are read.

    Returns:
        A list with one string per PDF file, in sorted file-name order.

    Raises:
        OSError: If *pdf_folder* cannot be listed or a file cannot be opened.
    """
    data = []
    for file in sorted(os.listdir(pdf_folder)):
        if not file.endswith(".pdf"):
            continue
        with open(os.path.join(pdf_folder, file), "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Extract each page only once; pages whose extraction result is
            # falsy (None or "") are skipped.
            page_texts = (page.extract_text() for page in reader.pages)
            text = " ".join(t for t in page_texts if t)
        data.append(text)
    return data
# Preprocess the extracted text into tabular rows
def preprocess_data(text_data):
    """Split raw text into whitespace-separated rows and build a DataFrame.

    Each text is split on newlines and each line on runs of whitespace.
    Lines that yield fewer than two tokens (blank lines, single words) are
    dropped. All cells remain strings; rows with fewer tokens than the
    widest row are padded by pandas with missing values.

    Args:
        text_data: Iterable of strings, e.g. one string per document.

    Returns:
        A ``pandas.DataFrame`` with one row per kept line and integer column
        labels; it is empty when no line qualifies.
    """
    processed_data = []
    for text in text_data:
        for line in text.split("\n"):
            values = line.split()
            # A single token cannot form a label/value row, so skip it.
            if len(values) > 1:
                processed_data.append(values)
    return pd.DataFrame(processed_data)
# Build the LSTM model
def create_lstm_model(input_shape):
    """Build and compile a two-layer LSTM regression model.

    The network stacks LSTM(50, relu, returning sequences) -> Dropout(0.2)
    -> LSTM(50, relu) -> Dropout(0.2) -> Dense(1), and is compiled with the
    Adam optimizer and mean-squared-error loss.

    Args:
        input_shape: Shape of one input sample passed to the first LSTM
            layer, e.g. ``(lookback, 1)``.

    Returns:
        The compiled ``Sequential`` model (untrained).
    """
    model = Sequential([
        # return_sequences=True so the second LSTM receives the full sequence.
        LSTM(50, activation='relu', return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(50, activation='relu'),
        Dropout(0.2),
        Dense(1),
    ])
    model.compile(optimizer='adam', loss='mse')
    return model
# Train the model and run the prediction
def train_and_predict_lstm(data, lookback=5):
    """Train an LSTM on a 1-D series and predict the value after its end.

    The series is converted to a float32 column vector, windowed into
    ``lookback``-long samples (batch size 1), used to train a fresh model
    from ``create_lstm_model`` for 20 epochs, and the last ``lookback``
    values are then fed to the model as a single window.

    Args:
        data: 1-D sequence of numeric values (list, array, Series, ...).
        lookback: Number of past values per training window; must be >= 1.

    Returns:
        The array returned by ``model.predict`` for the final window.

    Raises:
        ValueError: If ``lookback`` is smaller than 1, or the series has
            ``lookback`` or fewer values (no training window can be formed).
    """
    if lookback < 1:
        raise ValueError(f"lookback must be >= 1, got {lookback}")

    data = np.array(data, dtype=np.float32)
    data = data.reshape(-1, 1)

    # Fail early with a clear message instead of building the generator and
    # model first: at least lookback + 1 values are needed for one sample.
    if len(data) <= lookback:
        raise ValueError(
            f"need more than {lookback} data points for lookback={lookback}, "
            f"got {len(data)}"
        )

    generator = TimeseriesGenerator(data, data, length=lookback, batch_size=1)
    model = create_lstm_model((lookback, 1))
    model.fit(generator, epochs=20, verbose=1)
    # Shape the trailing window as (batch=1, timesteps=lookback, features=1).
    predictions = model.predict(data[-lookback:].reshape(1, lookback, 1))
    return predictions
# Example run
pdf_folder = "./pdf_data"  # Folder that holds the PDF files
pdf_texts = extract_text_from_pdfs(pdf_folder)
df = preprocess_data(pdf_texts)
df.to_csv("processed_data.csv", index=False)

# Prepare the data and predict (column 1 is used as an example)
if df.shape[1] > 1:
    # Cells come from free-form PDF text and may not be numeric (e.g. header
    # words); coerce those to NaN and drop them rather than letting
    # astype(float) abort the whole run on the first bad token.
    target_column = (
        pd.to_numeric(df.iloc[:, 1], errors="coerce").dropna().astype(float)
    )
    prediction = train_and_predict_lstm(target_column)
    print("예측 결과:", prediction)