"""KDDCUP 99 dataset.

A classic dataset for anomaly detection.

The dataset page is available from UCI Machine Learning Repository

https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz

"""

import errno
from gzip import GzipFile
import logging
import os
from os.path import dirname, exists, join

import numpy as np
import joblib

from ._base import _fetch_remote
from ._base import _convert_data_dataframe
from . import get_data_home
from ._base import RemoteFileMetadata
from ..utils import Bunch
from ..utils import check_random_state
from ..utils import shuffle as shuffle_method
from ..utils.validation import _deprecate_positional_args


# The original data can be found at:
# https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz
ARCHIVE = RemoteFileMetadata(
    filename='kddcup99_data',
    url='https://ndownloader.figshare.com/files/5976045',
    checksum=('3b6c942aa0356c0ca35b7b595a26c89d'
              '343652c9db428893e7494f837b274292'))

# The original data can be found at:
# https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data_10_percent.gz
ARCHIVE_10_PERCENT = RemoteFileMetadata(
    filename='kddcup99_10_data',
    url='https://ndownloader.figshare.com/files/5976042',
    checksum=('8045aca0d84e70e622d1148d7df78249'
              '6f6333bf6eb979a1b0837c42a9fd9561'))

logger = logging.getLogger(__name__)


@_deprecate_positional_args
def fetch_kddcup99(*, subset=None, data_home=None, shuffle=False,
                   random_state=None,
                   percent10=True, download_if_missing=True, return_X_y=False,
                   as_frame=False):
    """Load the kddcup99 dataset (classification).

    Download it if necessary.

    =================   ====================================
    Classes                                               23
    Samples total                                    4898431
    Dimensionality                                        41
    Features            discrete (int) or continuous (float)
    =================   ====================================

    Read more in the :ref:`User Guide <kddcup99_dataset>`.

    .. versionadded:: 0.18

    Parameters
    ----------
    subset : {'SA', 'SF', 'http', 'smtp'}, default=None
        To return the corresponding classical subsets of kddcup 99.
        If None, return the entire kddcup 99 dataset.

    data_home : str, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

        .. versionadded:: 0.19

    shuffle : bool, default=False
        Whether to shuffle dataset.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for dataset shuffling and for
        selection of abnormal samples if `subset='SA'`. Pass an int for
        reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    percent10 : bool, default=True
        Whether to load only 10 percent of the data.

    download_if_missing : bool, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object. See
        below for more information about the `data` and `target` object.

        .. versionadded:: 0.20

    as_frame : bool, default=False
        If `True`, returns a pandas Dataframe for the ``data`` and ``target``
        objects in the `Bunch` returned object; `Bunch` return object will also
        have a ``frame`` member.

        .. versionadded:: 0.24

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (494021, 41)
            The data matrix to learn. If `as_frame=True`, `data` will be a
            pandas DataFrame.
        target : {ndarray, series} of shape (494021,)
            The classification target for each sample (an attack type or
            the label 'normal.'). If `as_frame=True`, `target` will be a
            pandas Series.
        frame : dataframe of shape (494021, 42)
            Only present when `as_frame=True`. Contains `data` and `target`.
        DESCR : str
            The full description of the dataset.
        feature_names : list
            The names of the dataset columns
        target_names: list
            The names of the target columns

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.20
    """
    data_home = get_data_home(data_home=data_home)
    kddcup99 = _fetch_brute_kddcup99(
        data_home=data_home,
        percent10=percent10,
        download_if_missing=download_if_missing
    )

    data = kddcup99.data
    target = kddcup99.target
    feature_names = kddcup99.feature_names
    target_names = kddcup99.target_names

    if subset == 'SA':
        is_normal = target == b'normal.'
        is_abnormal = np.logical_not(is_normal)
        normal_samples = data[is_normal, :]
        normal_targets = target[is_normal]
        abnormal_samples = data[is_abnormal, :]
        abnormal_targets = target[is_abnormal]

        n_samples_abnormal = abnormal_samples.shape[0]
        # Keep all normal samples and draw 3377 abnormal ones (indices are
        # sampled with replacement by ``randint``).
        random_state = check_random_state(random_state)
        picked = random_state.randint(0, n_samples_abnormal, 3377)
        abnormal_samples = abnormal_samples[picked]
        abnormal_targets = abnormal_targets[picked]

        data = np.r_[normal_samples, abnormal_samples]
        target = np.r_[normal_targets, abnormal_targets]

    if subset in ('SF', 'http', 'smtp'):
        # Select all samples with positive logged_in attribute (column 11)
        # and drop that column; later column indices refer to the reduced
        # matrix.
        logged_in = data[:, 11] == 1
        data = np.c_[data[logged_in, :11], data[logged_in, 12:]]
        feature_names = feature_names[:11] + feature_names[12:]
        target = target[logged_in]

        # Log-transform duration (0), src_bytes (4) and dst_bytes (5); the
        # 0.1 offset keeps zero-valued entries finite.
        for col in (0, 4, 5):
            data[:, col] = np.log(
                (data[:, col] + 0.1).astype(float, copy=False))

        if subset == 'SF':
            # Keep the three log-transformed columns plus service (2).
            kept_columns = [0, 2, 4, 5]
        else:
            # 'http' / 'smtp': restrict to rows of that service, then keep
            # only the three log-transformed columns.
            same_service = data[:, 2] == subset.encode()
            data = data[same_service]
            target = target[same_service]
            kept_columns = [0, 4, 5]

        data = np.c_[tuple(data[:, col] for col in kept_columns)]
        feature_names = [feature_names[col] for col in kept_columns]

    if shuffle:
        data, target = shuffle_method(data, target, random_state=random_state)

    module_path = dirname(__file__)
    with open(join(module_path, 'descr', 'kddcup99.rst')) as rst_file:
        fdescr = rst_file.read()

    frame = None
    if as_frame:
        frame, data, target = _convert_data_dataframe(
            "fetch_kddcup99", data, target, feature_names, target_names
        )

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        frame=frame,
        target_names=target_names,
        feature_names=feature_names,
        DESCR=fdescr,
    )


def _fetch_brute_kddcup99(data_home=None,
                          download_if_missing=True, percent10=True):

    """Load the kddcup99 dataset, downloading it if necessary.

    The parsed samples and targets are cached with joblib under
    ``data_home``; the downloaded archive is deleted once it has been read.

    Parameters
    ----------
    data_home : str, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : bool, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    percent10 : bool, default=True
        Whether to load only 10 percent of the data.

    Returns
    -------
    dataset : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (494021, 41)
            Each row corresponds to the 41 features in the dataset.
        target : ndarray of shape (494021,)
            Each value corresponds to one of the 21 attack types or to the
            label 'normal.'.
        feature_names : list
            The names of the dataset columns
        target_names: list
            The names of the target columns

    Raises
    ------
    IOError
        If the data is not cached locally and `download_if_missing` is False.
    """

    data_home = get_data_home(data_home=data_home)
    dir_suffix = "-py3"

    if percent10:
        kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix)
        archive = ARCHIVE_10_PERCENT
    else:
        kddcup_dir = join(data_home, "kddcup99" + dir_suffix)
        archive = ARCHIVE

    samples_path = join(kddcup_dir, "samples")
    targets_path = join(kddcup_dir, "targets")
    # Both cache files are needed; a cache with only one of them is treated
    # as missing instead of failing later inside joblib.load.
    available = exists(samples_path) and exists(targets_path)

    dt = [('duration', int),
          ('protocol_type', 'S4'),
          ('service', 'S11'),
          ('flag', 'S6'),
          ('src_bytes', int),
          ('dst_bytes', int),
          ('land', int),
          ('wrong_fragment', int),
          ('urgent', int),
          ('hot', int),
          ('num_failed_logins', int),
          ('logged_in', int),
          ('num_compromised', int),
          ('root_shell', int),
          ('su_attempted', int),
          ('num_root', int),
          ('num_file_creations', int),
          ('num_shells', int),
          ('num_access_files', int),
          ('num_outbound_cmds', int),
          ('is_host_login', int),
          ('is_guest_login', int),
          ('count', int),
          ('srv_count', int),
          ('serror_rate', float),
          ('srv_serror_rate', float),
          ('rerror_rate', float),
          ('srv_rerror_rate', float),
          ('same_srv_rate', float),
          ('diff_srv_rate', float),
          ('srv_diff_host_rate', float),
          ('dst_host_count', int),
          ('dst_host_srv_count', int),
          ('dst_host_same_srv_rate', float),
          ('dst_host_diff_srv_rate', float),
          ('dst_host_same_src_port_rate', float),
          ('dst_host_srv_diff_host_rate', float),
          ('dst_host_serror_rate', float),
          ('dst_host_srv_serror_rate', float),
          ('dst_host_rerror_rate', float),
          ('dst_host_srv_rerror_rate', float),
          ('labels', 'S16')]

    column_names = [name for name, _ in dt]
    # The last column holds the labels; everything before it is a feature.
    target_names = [column_names[-1]]
    feature_names = column_names[:-1]

    if download_if_missing and not available:
        _mkdirp(kddcup_dir)
        logger.info("Downloading %s", archive.url)
        _fetch_remote(archive, dirname=kddcup_dir)
        DT = np.dtype(dt)
        logger.debug("extracting archive")
        archive_path = join(kddcup_dir, archive.filename)
        # The context manager closes the handle even if decoding fails, and
        # iterating the file avoids materialising every raw line up front.
        with GzipFile(filename=archive_path, mode='r') as file_:
            Xy = [line.decode().replace('\n', '').split(',')
                  for line in file_]
        logger.debug('extraction done')
        os.remove(archive_path)

        # Object array so that each column can carry its own Python type.
        Xy = np.asarray(Xy, dtype=object)
        for j in range(len(dt)):
            Xy[:, j] = Xy[:, j].astype(DT[j])

        X = Xy[:, :-1]
        y = Xy[:, -1]
        # XXX bug when compress!=0:
        # (error: 'Incorrect data length while decompressing[...] the file
        #  could be corrupted.')

        joblib.dump(X, samples_path, compress=0)
        joblib.dump(y, targets_path, compress=0)
    elif not available:
        # Reaching here means download_if_missing is False.
        raise IOError("Data not found and `download_if_missing` is False")
    else:
        X = joblib.load(samples_path)
        y = joblib.load(targets_path)

    return Bunch(
        data=X,
        target=y,
        feature_names=feature_names,
        target_names=target_names,
    )


def _mkdirp(d):
    """Ensure directory d exists (like mkdir -p on Unix)

    No guarantee that the directory is writable.

    Parameters
    ----------
    d : str
        Path of the directory to create, including missing parents.

    Raises
    ------
    OSError
        If creation fails for any reason other than the path already
        existing.
    """
    try:
        os.makedirs(d)
    except OSError as e:
        # An already-existing path is the expected, harmless case.
        if e.errno != errno.EEXIST:
            raise