# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Plot styling shared by every figure in this script.
sns.set(style="whitegrid")

# Loading the dataset (CSV format).
# The file has no header row: with the default header handling, the first
# record's values ('0', 'tcp', 'ftp_data', 'SF', '491', ...) are consumed as
# column names and that record is lost from the data. header=None keeps it.
df = pd.read_csv("NSL_KDD_Train.csv", header=None)

# Names of the six leading columns, by position in the file.
leading_names = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
]

# The label column is located by the value 'normal' in the first record.
# NOTE(review): replace this lookup with a fixed column position once the
# file layout has been confirmed.
first_record = df.iloc[0].tolist()
if "normal" not in first_record:
    raise ValueError(
        "Could not locate the label column: the first record of "
        "NSL_KDD_Train.csv contains no 'normal' value."
    )
label_position = first_record.index("normal")

# Every column gets a string name (scikit-learn rejects mixed int/str feature
# names); columns without a known meaning are named after their position.
column_names = [f"feature_{position}" for position in range(df.shape[1])]
column_names[:len(leading_names)] = leading_names
column_names[label_position] = "label"
df.columns = column_names

# Show first 5 rows of the data (explicit print: a bare expression shows
# nothing when this runs as a script).
print(df.head())

# Class balance: one bar per distinct value found in the 'label' column.
count_fig, count_ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='label', ax=count_ax)
count_ax.set_title("Distribution of Attacks vs Normal Traffic")
plt.show()

# Pairwise correlation of the numeric columns only (non-numeric columns are
# excluded by select_dtypes), drawn as an un-annotated heatmap.
df_numeric = df.select_dtypes(include=[np.number])
feature_corr = df_numeric.corr()
corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(feature_corr, cmap='coolwarm', annot=False, ax=corr_ax)
corr_ax.set_title("Correlation between Features")
plt.show()

# Label encoding: replace each categorical column in `df` with its integer
# category codes, in place.
categorical_columns = ['protocol_type', 'service', 'flag']
for column in categorical_columns:
    as_category = df[column].astype('category')
    df[column] = as_category.cat.codes

# One-hot encoding of the same (now integer-coded) columns into a separate
# frame; `df` itself is left label-encoded.
# NOTE(review): the feature/label split later in this file reads `df`, not
# `df_encoded` - confirm which encoding is meant to feed the model.
df_encoded = pd.get_dummies(df, columns=categorical_columns)



# Separate the target column (y) from the remaining feature columns (X).
target_column = 'label'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the dtypes of the training features and of the training labels.
print(X_train.dtypes)

print(y_train.dtypes)

# Fit a 100-tree random forest on the training split (seeded for
# reproducibility).
forest_settings = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_settings)
model.fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = model.predict(X_test)

# Confusion Matrix
# The matrix has one row/column per distinct label value, in sorted order.
# Compute that class list once and use it both for the matrix and for the
# tick labels, so the axes always name the classes actually being plotted
# (a fixed two-name list is wrong whenever there are more than two classes
# or their sorted order differs).
class_labels = sorted(set(y_test) | set(y_pred))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Keep the 8x6 figure for small class counts; grow it when there are many
# classes so the per-cell annotations stay legible.
n_classes = len(class_labels)
plt.figure(figsize=(max(8, 0.6 * n_classes), max(6, 0.5 * n_classes)))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Classification Report
# zero_division=0 reports 0 (instead of warning) for classes that are never
# predicted.
print(classification_report(y_test, y_pred, zero_division=0))

Requirements¶

You need to install these packages¶

Or just install Anaconda on your system (as I did)¶

Network Intrusion Detection System using Machine Learning¶

Overview¶

Problem Statement¶

Rename the columns of the dataset¶

Data Overview¶

Exploratory Data Analysis (EDA)¶

Data Preprocessing¶

Checking if all the sets are integrated¶

Model Training¶

Model Evaluation¶

Insights¶

Conclusion¶