Data Science Asked on December 24, 2020
I have a dataset that is already divided into training, test, and validation sets. How can I include the validation set in my pipeline?
Code:
class SVMSentiment(Base):
    """Predict sentiment scores using a linear Support Vector Machine (SVM).

    Uses a sklearn pipeline: TF-IDF text features feed an ``SGDClassifier``
    trained with hinge loss and an L2 penalty.
    """

    def __init__(self, model_file: str = None) -> None:
        """Build the (unfitted) TF-IDF + SGD pipeline.

        Args:
            model_file: Not used by this classifier; accepted so the
                constructor signature stays call-compatible.
        """
        super().__init__()
        # pip install scikit-learn  (the PyPI name ``sklearn`` is deprecated)
        # Imported lazily so scikit-learn is only required when this
        # classifier is actually instantiated.
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import SGDClassifier
        from sklearn.pipeline import Pipeline

        self.pipeline = Pipeline(
            [
                ('tfidf', TfidfVectorizer()),
                ('clf', SGDClassifier(
                    loss='hinge',
                    penalty='l2',
                    alpha=1e-4,
                    random_state=42,  # fixed seed for reproducible runs
                    max_iter=100,
                    learning_rate='optimal',
                    tol=None,  # no tolerance-based early stopping
                )),
            ]
        )

    def predict(self, train_file: str, test_file: str, lower_case: bool) -> pd.DataFrame:
        """Train the sklearn pipeline and predict labels for the test set.

        Args:
            train_file: Training data location, passed to ``self.read_data``.
            test_file: Test data location, passed to ``self.read_data``.
            lower_case: Forwarded unchanged to ``self.read_data``.

        Returns:
            The test DataFrame with an added 'pred' column holding the
            predicted labels.
        """
        # NOTE: no validation (dev) set is used here. The previous
        # ``self.read_data(dev_file, ...)`` line referenced an undefined name
        # and its result was never used, so it only raised NameError.
        train_df = self.read_data(train_file, lower_case)
        learner = self.pipeline.fit(train_df['text'], train_df['truth'])
        # Apply the fitted learner to the test data
        test_df = self.read_data(test_file, lower_case)
        test_df['pred'] = learner.predict(test_df['text'])
        return test_df
I don’t understand where I should include it, since the data is already split.
Edit: more on the predict method. I see that dev is unpacked but never used here:
def run_classifier(files: Tuple[str, str, str],
                   method: str,
                   method_class: Base,
                   model_file: str,
                   lower_case: bool) -> None:
    """Run one classifier end to end and save its confusion-matrix plot.

    Calls ``method_class.predict`` with the train and test filenames, passes
    the result to ``method_class.accuracy``, prints it, and writes a
    normalized confusion matrix to ``Plots/<method>.png``.

    Args:
        files: ``(train, dev, test)`` filenames. The dev filename is unpacked
            but not used by this function.
        method: Classifier name, used for the plot title and output filename.
        method_class: Classifier object exposing ``predict`` and ``accuracy``.
        model_file: Not used inside this function.
        lower_case: Forwarded to ``method_class.predict``.
    """
    # Unpack train, dev and test filenames (dev is intentionally unused)
    train_path, _dev_path, test_path = files
    predictions = method_class.predict(train_path, test_path, lower_case)
    method_class.accuracy(predictions)

    # Plot confusion matrix
    make_dirs("Plots")
    print(predictions)
    figure, axes = plot_confusion_matrix(
        predictions['truth'],
        predictions['pred'],
        normalize=True,
    )
    plot_title = "Normalized Confusion Matrix: {}".format(method.title())
    axes.set_title(plot_title)
    figure.tight_layout()
    output_path = "Plots/{}.png".format(method)
    figure.savefig(output_path)
Editing my answer -
def predict(self, train_file: str, test_file: str, lower_case: bool,
            dev_file: str = None) -> tuple:
    """Train the sklearn pipeline, then predict on the test and validation sets.

    Args:
        train_file: Training data location, passed to ``self.read_data``.
        test_file: Test data location, passed to ``self.read_data``.
        lower_case: Forwarded unchanged to ``self.read_data``.
        dev_file: Optional validation (dev) data location. It was previously
            read from an undefined name, which raised NameError; it is now an
            explicit parameter that defaults to ``None`` so existing
            three-argument calls keep working.

    Returns:
        A ``(test_df, dev_df)`` tuple. Each DataFrame gets an added 'pred'
        column with the predicted labels. ``dev_df`` is ``None`` when no
        ``dev_file`` is given.
    """
    train_df = self.read_data(train_file, lower_case)
    learner = self.pipeline.fit(train_df['text'], train_df['truth'])

    # Apply the fitted learner to the test data
    test_df = self.read_data(test_file, lower_case)
    test_df['pred'] = learner.predict(test_df['text'])

    # Score the validation set only when one was supplied
    dev_df = None
    if dev_file is not None:
        dev_df = self.read_data(dev_file, lower_case)
        dev_df['pred'] = learner.predict(dev_df['text'])

    return test_df, dev_df
Use two variables when you call the predict function to store the test and dev results, e.g. test_df, dev_df = method_class.predict(...).
Answered by Madhur Yadav on December 24, 2020
Get help from others!
Recent Answers
Recent Questions
© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP