import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import model_selection
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
- # Load the training data
- def load_data(filepath):
- data = pd.read_csv(filepath)
- return data
- # Preprocess the data
- def preprocess_data(data):
- data = data.dropna() # Drop missing values
- data['text'] = data['text'].apply(clean_text) # Clean text data
-
- return data
- def clean_text(text):
- text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # Remove special characters
- text = text.lower() # Convert to lowercase
- return text
- # Train the model
- def train_model(data):
- # Split the data
- X = data[['text', 'amount']]
- y = data['category']
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
- # Text preprocessing and feature extraction
- text_transformer = Pipeline(steps=[
- ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english')))
- ])
- # Amount scaling
- amount_transformer = Pipeline(steps=[
- ('scaler', StandardScaler())
- ])
-
- # Combine features
- preprocessor = ColumnTransformer(
- transformers=[
- ('text', text_transformer, 'text'),
- ('amount', amount_transformer, 'amount')
- ])
-
- # Create the model pipeline
- model = Pipeline(steps=[
- ('preprocessor', preprocessor),
- ('classifier', MultinomialNB())
- ])
-
- # Train the model
- model.fit(X_train, y_train)
-
- # Test the model
- y_pred = model.predict(X_test)
- accuracy = accuracy_score(y_test, y_pred)
- print(f'Model Accuracy: {accuracy}')
-
- return model
- # Predict categories for new data
- def predict_category(model, new_data_filepath):
- new_data = pd.read_csv(new_data_filepath)
- if new_data.ndim == 1: # Ensure the input is a DataFrame
- new_data = pd.DataFrame([new_data])
- predictions = model.predict(new_data)
- new_data['predicted_category'] = predictions
- return new_data
- if __name__ == "__main__":
- # Load and preprocess the data
- data_filepath = './books.cat/Learn.csv' # Replace with your training data CSV file path
- data = load_data(data_filepath)
- data = preprocess_data(data)
- data.info()
- print(data.shape)
- data.describe()
- data.corr()
- #data = preprocess_data(data)
-
- # Train the model
- #model = train_model(data)
-
- # Predict categories for new data
- #new_data_filepath = './books.cat/test.csv' # Replace with your new data CSV file path
- #predictions = predict_category(model, new_data_filepath)
-
- # Save predictions to a new CSV file
- #predictions.to_csv('./books.cat/predictions.csv', index=False)
- #print('Predictions saved to predictions.csv')