Hi all,
I am trying to create a simple linear regression model in python but I am getting an error.
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | importpandas as pdimportmatplotlib as pltimportnumpy as npdf=pd.DataFramedf =pd.read_csv('C:\\Users\\User\\Downloads\\HRDataset_v14.csv')fromsklearn.model_selection importtrain_test_splitfromsklearn.linear_model importLinearRegressionfromsklearn.metrics importmean_squared_error, r2_score# Convert categorical columns to numericaldf_encoded =pd.get_dummies(df, drop_first=True)# Features (X) and target (y)X =df_encoded['EngagementSurvey']y =df_encoded['Absences']# Train-test splitX_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.2, random_state=42)# Model traininglinear_model =LinearRegression()linear_model.fit(X_train, y_train)# Predictionsy_pred =linear_model.predict(X_test)# Evaluationmse =mean_squared_error(y_test, y_pred)r2 =r2_score(y_test, y_pred)print(f"Mean Squared Error: {mse}")print(f"R-squared: {r2}") | 
The code is generating the following error, not sure why?
Error:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[11], line 17
     15 # Model training
     16 linear_model = LinearRegression()
---> 17 linear_model.fit(X_train, y_train)
     19 # Predictions
     20 y_pred = linear_model.predict(X_test)
File ~\anaconda3\Lib\site-packages\sklearn\linear_model\_base.py:648, in LinearRegression.fit(self, X, y, sample_weight)
    644 n_jobs_ = self.n_jobs
    646 accept_sparse = False if self.positive else ["csr", "csc", "coo"]
--> 648 X, y = self._validate_data(
    649     X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
    650 )
    652 sample_weight = _check_sample_weight(
    653     sample_weight, X, dtype=X.dtype, only_non_negative=True
    654 )
    656 X, y, X_offset, y_offset, X_scale = _preprocess_data(
    657     X,
    658     y,
   (...)
    661     sample_weight=sample_weight,
    662 )
File ~\anaconda3\Lib\site-packages\sklearn\base.py:584, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    582         y = check_array(y, input_name="y", **check_y_params)
    583     else:
--> 584         X, y = check_X_y(X, y, **check_params)
    585     out = X, y
    587 if not no_val_X and check_params.get("ensure_2d", True):
File ~\anaconda3\Lib\site-packages\sklearn\utils\validation.py:1106, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1101         estimator_name = _check_estimator_name(estimator)
   1102     raise ValueError(
   1103         f"{estimator_name} requires y to be passed, but the target y is None"
   1104     )
-> 1106 X = check_array(
   1107     X,
   1108     accept_sparse=accept_sparse,
   1109     accept_large_sparse=accept_large_sparse,
   1110     dtype=dtype,
   1111     order=order,
   1112     copy=copy,
   1113     force_all_finite=force_all_finite,
   1114     ensure_2d=ensure_2d,
   1115     allow_nd=allow_nd,
   1116     ensure_min_samples=ensure_min_samples,
   1117     ensure_min_features=ensure_min_features,
   1118     estimator=estimator,
   1119     input_name="X",
   1120 )
   1122 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
   1124 check_consistent_length(X, y)
File ~\anaconda3\Lib\site-packages\sklearn\utils\validation.py:902, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    900     # If input is 1D raise error
    901     if array.ndim == 1:
--> 902         raise ValueError(
    903             "Expected 2D array, got 1D array instead:\narray={}.\n"
    904             "Reshape your data either using array.reshape(-1, 1) if "
    905             "your data has a single feature or array.reshape(1, -1) "
    906             "if it contains a single sample.".format(array)
    907         )
    909 if dtype_numeric and array.dtype.kind in "USV":
    910     raise ValueError(
    911         "dtype='numeric' is not compatible with arrays of bytes/strings."
    912         "Convert your data to numeric values explicitly instead."
    913     )
ValueError: Expected 2D array, got 1D array instead:
array=[4.3  4.6  4.2  5.   4.52 4.5  5.   4.63 3.04 5.   3.19 5.   1.81 4.73
 3.07 4.3  5.   3.98 3.6  3.75 4.1  1.56 4.2  3.79 4.4  4.2  3.51 4.2
 4.   2.9  5.   4.5  2.3  5.   4.2  3.4  3.69 5.   4.62 4.94 4.96 4.4
 3.7  3.38 4.1  1.93 4.5  1.12 3.5  3.93 4.78 3.6  4.64 4.5  2.4  5.
 4.1  3.24 2.1  4.3  3.49 4.76 3.27 4.53 4.07 4.5  3.08 3.81 4.16 3.21
 3.03 4.43 2.6  5.   4.7  3.   4.6  4.1  5.   4.21 4.2  4.18 4.5  3.02
 5.   4.4  5.   4.2  3.66 3.9  5.   3.4  4.5  3.   4.68 4.5  4.33 4.1
 4.8  3.73 4.4  5.   4.25 2.3  4.8  5.   4.28 3.84 3.35 4.64 5.   3.32
 4.5  4.5  4.1  2.   4.3  5.   4.36 5.   3.13 3.8  3.72 5.   5.   4.6
 3.89 4.6  3.69 4.1  4.5  4.1  4.1  4.24 4.65 4.3  5.   3.1  4.83 4.2
 4.7  5.   4.2  3.2  5.   3.93 3.39 5.   5.   4.2  3.14 4.6  4.1  5.
 5.   3.99 5.   2.4  4.5  3.6  5.   4.46 3.01 4.5  4.3  3.8  4.1  4.46
 5.   5.   4.6  3.45 3.1  2.   4.7  4.3  4.7  3.6  4.28 5.   5.   4.81
 3.6  4.1  3.18 5.   3.58 3.66 4.36 4.15 2.44 4.   4.9  3.3  4.2  2.39
 3.8  3.97 2.   4.96 4.2  5.   2.33 3.25 2.4  1.2  4.4  4.83 5.   4.6
 4.   3.4  4.53 3.4  3.   3.42 3.6  4.13 4.2  5.   3.7  5.   4.37 4.
 4.12 4.7  5.   3.31 4.2  5.   4.5  4.7  4.3  4.3  4.3  4.48 3.6  4.7
 4.4  3.54 4.5  4.5  5.   3.   4.3  3.9  4.1  3.73].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.Any help is greatly appreciated.
Regards,
ForrestGump
