Hi all,
I am trying to create a simple linear regression model in python but I am getting an error.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
import pandas as pd import matplotlib as plt import numpy as np df = pd.DataFrame df = pd.read_csv( 'C:\\Users\\User\\Downloads\\HRDataset_v14.csv' ) from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error, r2_score # Convert categorical columns to numerical df_encoded = pd.get_dummies(df, drop_first = True ) # Features (X) and target (y) X = df_encoded[ 'EngagementSurvey' ] y = df_encoded[ 'Absences' ] # Train-test split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2 , random_state = 42 ) # Model training linear_model = LinearRegression() linear_model.fit(X_train, y_train) # Predictions y_pred = linear_model.predict(X_test) # Evaluation mse = mean_squared_error(y_test, y_pred) r2 = r2_score(y_test, y_pred) print ( f "Mean Squared Error: {mse}" ) print ( f "R-squared: {r2}" ) |
The code is generating the following error, not sure why?
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[11], line 17
15 # Model training
16 linear_model = LinearRegression()
---> 17 linear_model.fit(X_train, y_train)
19 # Predictions
20 y_pred = linear_model.predict(X_test)
File ~\anaconda3\Lib\site-packages\sklearn\linear_model\_base.py:648, in LinearRegression.fit(self, X, y, sample_weight)
644 n_jobs_ = self.n_jobs
646 accept_sparse = False if self.positive else ["csr", "csc", "coo"]
--> 648 X, y = self._validate_data(
649 X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
650 )
652 sample_weight = _check_sample_weight(
653 sample_weight, X, dtype=X.dtype, only_non_negative=True
654 )
656 X, y, X_offset, y_offset, X_scale = _preprocess_data(
657 X,
658 y,
(...)
661 sample_weight=sample_weight,
662 )
File ~\anaconda3\Lib\site-packages\sklearn\base.py:584, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
582 y = check_array(y, input_name="y", **check_y_params)
583 else:
--> 584 X, y = check_X_y(X, y, **check_params)
585 out = X, y
587 if not no_val_X and check_params.get("ensure_2d", True):
File ~\anaconda3\Lib\site-packages\sklearn\utils\validation.py:1106, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1101 estimator_name = _check_estimator_name(estimator)
1102 raise ValueError(
1103 f"{estimator_name} requires y to be passed, but the target y is None"
1104 )
-> 1106 X = check_array(
1107 X,
1108 accept_sparse=accept_sparse,
1109 accept_large_sparse=accept_large_sparse,
1110 dtype=dtype,
1111 order=order,
1112 copy=copy,
1113 force_all_finite=force_all_finite,
1114 ensure_2d=ensure_2d,
1115 allow_nd=allow_nd,
1116 ensure_min_samples=ensure_min_samples,
1117 ensure_min_features=ensure_min_features,
1118 estimator=estimator,
1119 input_name="X",
1120 )
1122 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
1124 check_consistent_length(X, y)
File ~\anaconda3\Lib\site-packages\sklearn\utils\validation.py:902, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
900 # If input is 1D raise error
901 if array.ndim == 1:
--> 902 raise ValueError(
903 "Expected 2D array, got 1D array instead:\narray={}.\n"
904 "Reshape your data either using array.reshape(-1, 1) if "
905 "your data has a single feature or array.reshape(1, -1) "
906 "if it contains a single sample.".format(array)
907 )
909 if dtype_numeric and array.dtype.kind in "USV":
910 raise ValueError(
911 "dtype='numeric' is not compatible with arrays of bytes/strings."
912 "Convert your data to numeric values explicitly instead."
913 )
ValueError: Expected 2D array, got 1D array instead:
array=[4.3 4.6 4.2 5. 4.52 4.5 5. 4.63 3.04 5. 3.19 5. 1.81 4.73
3.07 4.3 5. 3.98 3.6 3.75 4.1 1.56 4.2 3.79 4.4 4.2 3.51 4.2
4. 2.9 5. 4.5 2.3 5. 4.2 3.4 3.69 5. 4.62 4.94 4.96 4.4
3.7 3.38 4.1 1.93 4.5 1.12 3.5 3.93 4.78 3.6 4.64 4.5 2.4 5.
4.1 3.24 2.1 4.3 3.49 4.76 3.27 4.53 4.07 4.5 3.08 3.81 4.16 3.21
3.03 4.43 2.6 5. 4.7 3. 4.6 4.1 5. 4.21 4.2 4.18 4.5 3.02
5. 4.4 5. 4.2 3.66 3.9 5. 3.4 4.5 3. 4.68 4.5 4.33 4.1
4.8 3.73 4.4 5. 4.25 2.3 4.8 5. 4.28 3.84 3.35 4.64 5. 3.32
4.5 4.5 4.1 2. 4.3 5. 4.36 5. 3.13 3.8 3.72 5. 5. 4.6
3.89 4.6 3.69 4.1 4.5 4.1 4.1 4.24 4.65 4.3 5. 3.1 4.83 4.2
4.7 5. 4.2 3.2 5. 3.93 3.39 5. 5. 4.2 3.14 4.6 4.1 5.
5. 3.99 5. 2.4 4.5 3.6 5. 4.46 3.01 4.5 4.3 3.8 4.1 4.46
5. 5. 4.6 3.45 3.1 2. 4.7 4.3 4.7 3.6 4.28 5. 5. 4.81
3.6 4.1 3.18 5. 3.58 3.66 4.36 4.15 2.44 4. 4.9 3.3 4.2 2.39
3.8 3.97 2. 4.96 4.2 5. 2.33 3.25 2.4 1.2 4.4 4.83 5. 4.6
4. 3.4 4.53 3.4 3. 3.42 3.6 4.13 4.2 5. 3.7 5. 4.37 4.
4.12 4.7 5. 3.31 4.2 5. 4.5 4.7 4.3 4.3 4.3 4.48 3.6 4.7
4.4 3.54 4.5 4.5 5. 3. 4.3 3.9 4.1 3.73].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
Any help is greatly appreciated.
Regards,
ForrestGump