# asterisk / books
import sys

import joblib
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Custom transformer to reshape data
class ReshapeTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that reshapes its input into a single column.

    Used below in front of ``StandardScaler``, which requires 2-D input,
    for a column that ``ColumnTransformer`` selects by a scalar name.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Return ``X`` as an array of shape ``(-1, 1)``.

        ``np.asarray`` is used instead of ``X.values`` so that pandas
        objects, NumPy arrays and plain sequences are all accepted;
        ``.values`` exists only on pandas objects.
        """
        return np.asarray(X).reshape(-1, 1)

# Train a merchant -> category classifier, persist it, and optionally
# classify one merchant given on the command line:
#
#     <script> [merchant_name [amount]]

# Load the dataset
df = pd.read_csv('Learn.csv')

# Display the first few rows of the dataset
print(df.head())

# Split data into features and target
X = df[['merchant_name', 'amount']]
y = df['category']

# Split the data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the preprocessing for both text and numerical data.
# The column selectors are scalar strings, so each transformer receives a
# single column rather than a one-column frame: that is what TfidfVectorizer
# expects, and why the amount branch reshapes before scaling.
preprocessor = ColumnTransformer(
    transformers=[
        ('merchant_tfidf', TfidfVectorizer(), 'merchant_name'),
        ('amount_scaler', Pipeline([
            ('reshape', ReshapeTransformer()),
            ('scaler', StandardScaler())
        ]), 'amount')
    ]
)

# Create the full pipeline with preprocessing and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)


# Save the model to a file
joblib.dump(pipeline, 'model_pipeline.pkl')
print("Model saved successfully.")

# Load the model from the file (round-trip check of the saved artifact).
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
loaded_pipeline = joblib.load('model_pipeline.pkl')
print("Model loaded successfully.")


if len(sys.argv) > 1:
    merchant = sys.argv[1]
    # The amount goes through StandardScaler, so pass a number rather than
    # the string '5300'. An optional second argument overrides the default.
    amount = float(sys.argv[2]) if len(sys.argv) > 2 else 5300.0
    pred = loaded_pipeline.predict(pd.DataFrame({
        'merchant_name': [merchant],
        'amount': [amount],
    }))

    print(f"Merchant {sys.argv[1]} -> {pred}")

# Print the classification report
print(classification_report(y_test, y_pred, zero_division=0))