| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
- import sys
- import pandas as pd
- from sklearn.model_selection import train_test_split
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.preprocessing import StandardScaler
- from sklearn.compose import ColumnTransformer
- from sklearn.pipeline import Pipeline
- from sklearn.linear_model import LogisticRegression
- from sklearn.metrics import classification_report
- from sklearn.base import BaseEstimator, TransformerMixin
- import joblib
# Custom transformer that reshapes a single feature column into 2-D form
class ReshapeTransformer(BaseEstimator, TransformerMixin):
    """Reshape a one-dimensional feature column into an ``(n, 1)`` array.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only changes the layout of its input, never the values.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn.

        Args:
            X: Training data (ignored).
            y: Optional target values (ignored).

        Returns:
            This transformer instance.
        """
        return self

    def transform(self, X):
        """Return ``X`` as a two-dimensional array with a single column.

        Args:
            X: A pandas Series, NumPy array, or any other array-like that
                ``numpy.asarray`` can convert.

        Returns:
            A NumPy array holding the same values with shape ``(-1, 1)``.
        """
        # Imported locally because the module does not import numpy at the
        # top level; numpy is already a dependency of pandas.
        import numpy as np

        # ``np.asarray`` handles pandas objects and plain array-likes alike,
        # whereas ``.values`` is only available on pandas objects.
        return np.asarray(X).reshape(-1, 1)
# Load the labelled dataset; the file is expected in the working directory
df = pd.read_csv('Learn.csv')
# Display the first few rows of the dataset
print(df.head())
# Split data into features and target
X = df[['merchant_name', 'amount']]
y = df['category']
# Split the data into training and testing sets (20% held out, fixed seed so
# the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Define the preprocessing for both text and numerical data.
# The columns are selected with scalar strings, so each transformer receives
# a one-dimensional column: that is what TfidfVectorizer consumes directly,
# and it is why the amount column is reshaped before it is scaled.
preprocessor = ColumnTransformer(
    transformers=[
        ('merchant_tfidf', TfidfVectorizer(), 'merchant_name'),
        ('amount_scaler', Pipeline([
            ('reshape', ReshapeTransformer()),
            ('scaler', StandardScaler()),
        ]), 'amount'),
    ]
)
# Create the full pipeline with preprocessing and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])
# Train the model
pipeline.fit(X_train, y_train)
# Predict on the test set
y_pred = pipeline.predict(X_test)
# Save the model to a file
joblib.dump(pipeline, 'model_pipeline.pkl')
print("Model saved successfully.")
# Load the model back from the file that was just written
loaded_pipeline = joblib.load('model_pipeline.pkl')
print("Model loaded successfully.")
# Optional ad-hoc prediction: argv[1] is the merchant name and argv[2], when
# given, is the transaction amount (defaults to 5300).
if len(sys.argv) > 1:
    try:
        # The amount is passed as a number, not a string, so the prediction
        # input does not depend on implicit string-to-number coercion.
        amount = float(sys.argv[2]) if len(sys.argv) > 2 else 5300.0
    except ValueError:
        sys.exit(f"Amount must be a number, got {sys.argv[2]!r}")

    pred = loaded_pipeline.predict(pd.DataFrame({
        'merchant_name': [sys.argv[1]],
        'amount': [amount],
    }))

    print(f"Merchant {sys.argv[1]} -> {pred}")
# Print the classification report for the held-out test set
print(classification_report(y_test, y_pred, zero_division=0))
|