TransWikia.com

How to include validation set in the pipeline to tune parameters for an SVM?

Data Science Asked on December 24, 2020

I have a dataset that is already divided into train, test and validation sets. How can I include the validation set in my pipeline?

Code:

class SVMSentiment(Base):
    """Predict sentiment scores using a linear Support Vector Machine (SVM).

    The model is a scikit-learn ``Pipeline``: a TF-IDF vectorizer followed by
    an ``SGDClassifier`` with hinge loss (a linear SVM trained with
    stochastic gradient descent).
    """

    def __init__(self, model_file: str = None) -> None:
        """Build the (unfitted) TF-IDF + linear-SVM pipeline.

        Args:
            model_file: Not used by this classifier.
                NOTE(review): presumably kept so all classifiers share one
                constructor signature -- confirm against the other classes.
        """
        super().__init__()
        # pip install scikit-learn
        # Imported lazily so scikit-learn is only required when this
        # classifier is actually instantiated.
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import SGDClassifier
        from sklearn.pipeline import Pipeline

        self.pipeline = Pipeline(
            [
                ('tfidf', TfidfVectorizer()),
                ('clf', SGDClassifier(
                    loss='hinge',  # hinge loss => linear SVM
                    penalty='l2',
                    alpha=1e-4,
                    random_state=42,
                    max_iter=100,
                    learning_rate='optimal',
                    tol=None,  # no tolerance-based stopping: run all epochs
                )),
            ]
        )

    def predict(self, train_file: str, test_file: str, lower_case: bool,
                dev_file: str = None, param_grid: dict = None) -> pd.DataFrame:
        """Train the pipeline and predict labels for the test set.

        When both ``dev_file`` and ``param_grid`` are given, the validation
        set is first used to pick the best hyper-parameters; the pipeline is
        then trained on the training set only. Without them the pipeline is
        trained with the parameters set in ``__init__``.

        Args:
            train_file: Training file, loaded through ``self.read_data``.
            test_file: Test file, loaded through ``self.read_data``.
            lower_case: Forwarded unchanged to ``self.read_data``.
            dev_file: Optional validation file. Ignored unless ``param_grid``
                is also supplied.
            param_grid: Optional ``GridSearchCV``-style grid over pipeline
                parameters, e.g. ``{'clf__alpha': [1e-5, 1e-4, 1e-3]}``.

        Returns:
            The test data with the predictions stored in a ``'pred'`` column.
        """
        train_df = self.read_data(train_file, lower_case)
        if dev_file is not None and param_grid:
            dev_df = self.read_data(dev_file, lower_case)
            self._tune(train_df, dev_df, param_grid)
        learner = self.pipeline.fit(train_df['text'], train_df['truth'])
        # Predict on the held-out test data with the fitted learner.
        test_df = self.read_data(test_file, lower_case)
        test_df['pred'] = learner.predict(test_df['text'])
        return test_df

    def _tune(self, train_df: pd.DataFrame, dev_df: pd.DataFrame,
              param_grid: dict) -> None:
        """Set the pipeline parameters that score best on the validation set.

        Assumes both frames are pandas DataFrames with ``'text'`` and
        ``'truth'`` columns, as used by ``predict`` -- TODO confirm against
        ``read_data``.
        """
        from sklearn.model_selection import GridSearchCV, PredefinedSplit

        texts = pd.concat([train_df['text'], dev_df['text']], ignore_index=True)
        labels = pd.concat([train_df['truth'], dev_df['truth']], ignore_index=True)
        # -1 keeps a row in the training part of the single split;
        # 0 assigns it to the one and only validation fold.
        test_fold = [-1] * len(train_df) + [0] * len(dev_df)
        search = GridSearchCV(
            self.pipeline,
            param_grid,
            cv=PredefinedSplit(test_fold),
            refit=False,  # predict() does the final fit on the train set only
        )
        search.fit(texts, labels)
        self.pipeline.set_params(**search.best_params_)

I don’t understand where I should include it, since the data is already split.

Edit: more on how the predict method is called. I see that the dev set is not used here:

def run_classifier(files: Tuple[str, str, str],
                   method: str,
                   method_class: Base,
                   model_file: str,
                   lower_case: bool) -> None:
    """Train and evaluate one classifier, then save its confusion matrix plot.

    ``files`` holds the train, dev and test filenames, in that order. Only the
    train and test files are handed to ``method_class.predict``; the dev file
    and ``model_file`` are not used in this function. The normalized confusion
    matrix is saved as ``Plots/<method>.png``.
    """
    train_path, _dev_path, test_path = files
    predictions = method_class.predict(train_path, test_path, lower_case)
    method_class.accuracy(predictions)

    # Make sure the output directory exists and echo the predictions.
    make_dirs("Plots")
    print(predictions)

    # Draw, title and save the normalized confusion matrix.
    fig, ax = plot_confusion_matrix(
        predictions['truth'],
        predictions['pred'],
        normalize=True,
    )
    plot_title = "Normalized Confusion Matrix: {}".format(method.title())
    ax.set_title(plot_title)
    fig.tight_layout()
    output_path = "Plots/{}.png".format(method)
    fig.savefig(output_path)

One Answer

Editing my answer -

def predict(self, train_file: str, test_file: str, lower_case: bool,
            dev_file: str = None) -> tuple:
    """Train the sklearn pipeline and predict on the test and dev sets.

    Args:
        train_file: Training file, loaded through ``self.read_data``.
        test_file: Test file, loaded through ``self.read_data``.
        lower_case: Forwarded unchanged to ``self.read_data``.
        dev_file: Optional validation (dev) file. When omitted, no dev
            predictions are made.

    Returns:
        A ``(test_df, dev_df)`` pair. Each frame carries the model output in
        a ``'pred'`` column; ``dev_df`` is ``None`` when ``dev_file`` is not
        given.
    """
    train_df = self.read_data(train_file, lower_case)
    learner = self.pipeline.fit(train_df['text'], train_df['truth'])

    # Score the validation set with the learner fitted on the training data.
    dev_df = None
    if dev_file is not None:
        dev_df = self.read_data(dev_file, lower_case)
        dev_df['pred'] = learner.predict(dev_df['text'])

    # Predict on the held-out test data.
    test_df = self.read_data(test_file, lower_case)
    test_df['pred'] = learner.predict(test_df['text'])
    return test_df, dev_df

Use two variables when calling the predict function to store the test and dev results.

Answered by Madhur Yadav on December 24, 2020

Add your own answers!

Ask a Question

Get help from others!

© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP