# learn.py
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import model_selection
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
  8. # Load the training data
  9. def load_data(filepath):
  10. data = pd.read_csv(filepath)
  11. return data
  12. # Preprocess the data
  13. def preprocess_data(data):
  14. data = data.dropna() # Drop missing values
  15. data['text'] = data['text'].apply(clean_text) # Clean text data
  16. return data
  17. def clean_text(text):
  18. text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # Remove special characters
  19. text = text.lower() # Convert to lowercase
  20. return text
  21. # Train the model
  22. def train_model(data):
  23. # Split the data
  24. X = data[['text', 'amount']]
  25. y = data['category']
  26. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  27. # Text preprocessing and feature extraction
  28. text_transformer = Pipeline(steps=[
  29. ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english')))
  30. ])
  31. # Amount scaling
  32. amount_transformer = Pipeline(steps=[
  33. ('scaler', StandardScaler())
  34. ])
  35. # Combine features
  36. preprocessor = ColumnTransformer(
  37. transformers=[
  38. ('text', text_transformer, 'text'),
  39. ('amount', amount_transformer, 'amount')
  40. ])
  41. # Create the model pipeline
  42. model = Pipeline(steps=[
  43. ('preprocessor', preprocessor),
  44. ('classifier', MultinomialNB())
  45. ])
  46. # Train the model
  47. model.fit(X_train, y_train)
  48. # Test the model
  49. y_pred = model.predict(X_test)
  50. accuracy = accuracy_score(y_test, y_pred)
  51. print(f'Model Accuracy: {accuracy}')
  52. return model
  53. # Predict categories for new data
  54. def predict_category(model, new_data_filepath):
  55. new_data = pd.read_csv(new_data_filepath)
  56. if new_data.ndim == 1: # Ensure the input is a DataFrame
  57. new_data = pd.DataFrame([new_data])
  58. predictions = model.predict(new_data)
  59. new_data['predicted_category'] = predictions
  60. return new_data
  61. if __name__ == "__main__":
  62. # Load and preprocess the data
  63. data_filepath = './books.cat/Learn.csv' # Replace with your training data CSV file path
  64. data = load_data(data_filepath)
  65. data = preprocess_data(data)
  66. data.info()
  67. print(data.shape)
  68. data.describe()
  69. data.corr()
  70. #data = preprocess_data(data)
  71. # Train the model
  72. #model = train_model(data)
  73. # Predict categories for new data
  74. #new_data_filepath = './books.cat/test.csv' # Replace with your new data CSV file path
  75. #predictions = predict_category(model, new_data_filepath)
  76. # Save predictions to a new CSV file
  77. #predictions.to_csv('./books.cat/predictions.csv', index=False)
  78. #print('Predictions saved to predictions.csv')