# learn.py
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import model_selection
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
  8. # Load the training data
  9. def load_data(filepath):
  10. data = pd.read_csv(filepath)
  11. return data
  12. # Preprocess the data
  13. def preprocess_data(data):
  14. data = data.dropna() # Drop missing values
  15. data['text'] = data['text'].apply(clean_text) # Clean text data
  16. return data
  17. def clean_text(text):
  18. text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # Remove special characters
  19. text = text.lower() # Convert to lowercase
  20. return text
  21. # Train the model
  22. def train_model(data):
  23. # Split the data
  24. X = data[['text', 'amount']]
  25. y = data['category']
  26. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  27. # Text preprocessing and feature extraction
  28. text_transformer = Pipeline(steps=[
  29. ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english')))
  30. ])
  31. # Amount scaling
  32. amount_transformer = Pipeline(steps=[
  33. ('scaler', StandardScaler())
  34. ])
  35. # Combine features
  36. preprocessor = ColumnTransformer(
  37. transformers=[
  38. ('text', text_transformer, 'text'),
  39. ('amount', amount_transformer, 'amount')
  40. ])
  41. # Create the model pipeline
  42. model = Pipeline(steps=[
  43. ('preprocessor', preprocessor),
  44. ('classifier', MultinomialNB())
  45. ])
  46. # Train the model
  47. model.fit(X_train, y_train)
  48. # Test the model
  49. y_pred = model.predict(X_test)
  50. accuracy = accuracy_score(y_test, y_pred)
  51. print(f'Model Accuracy: {accuracy}')
  52. return model
  53. # Predict categories for new data
  54. def predict_category(model, new_data_filepath):
  55. new_data = pd.read_csv(new_data_filepath)
  56. if new_data.ndim == 1: # Ensure the input is a DataFrame
  57. new_data = pd.DataFrame([new_data])
  58. predictions = model.predict(new_data)
  59. new_data['predicted_category'] = predictions
  60. return new_data
  61. if __name__ == "__main__":
  62. # Load and preprocess the data
  63. data_filepath = './books.cat/Learn.csv' # Replace with your training data CSV file path
  64. data = load_data(data_filepath)
  65. data = preprocess_data(data)
  66. data.info()
  67. print(data.shape)
  68. data.describe()
  69. data.corr()
  70. #data = preprocess_data(data)
  71. # Train the model
  72. #model = train_model(data)
  73. # Predict categories for new data
  74. #new_data_filepath = './books.cat/test.csv' # Replace with your new data CSV file path
  75. #predictions = predict_category(model, new_data_filepath)
  76. # Save predictions to a new CSV file
  77. #predictions.to_csv('./books.cat/predictions.csv', index=False)
  78. #print('Predictions saved to predictions.csv')