"""
Reference: Informer (2020).
"""
0 导入库
from typing import List
import numpy as np
import pandas as pd
from pandas.tseries import offsets
from pandas.tseries.frequencies import to_offset
1 TimeFeature
基类,没有实际功能,只定义了基本接口,包括 __call__ 和 __repr__ 方法。
参考:python笔记:类方法之 __call__ 和 __repr__(CSDN 博客)
class TimeFeature:
    """Base class for features computed from a ``pd.DatetimeIndex``.

    The base class carries no logic of its own; it only fixes the shared
    interface: instances are callable on a datetime index and have a
    readable ``repr``.
    """

    def __init__(self):
        # No state to initialise.
        return None

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        """Placeholder implementation; the base class returns ``None``."""
        return None

    def __repr__(self):
        """Return the class name followed by ``()``, e.g. ``TimeFeature()``."""
        return f"{self.__class__.__name__}()"
2 SecondOfMinute
提取时间戳对应分钟内的秒数,并标准化到 [-0.5, 0.5] 范围内
class SecondOfMinute(TimeFeature):
    """Second of minute encoded as value between [-0.5, 0.5]."""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Second 0 maps to -0.5 and second 59 maps to 0.5.
        scaled = index.second / 59.0
        return scaled - 0.5
3 MinuteOfHour
提取时间戳对应小时内的分钟数,并标准化到 [-0.5, 0.5] 范围内
class MinuteOfHour(TimeFeature):
    """Minute of hour encoded as value between [-0.5, 0.5]."""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Minute 0 maps to -0.5 and minute 59 maps to 0.5.
        scaled = index.minute / 59.0
        return scaled - 0.5
4 HourOfDay
提取时间戳对应一天内的小时数,并标准化到 [-0.5, 0.5] 范围内
class HourOfDay(TimeFeature):
    """Hour of day encoded as value between [-0.5, 0.5]."""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Hour 0 maps to -0.5 and hour 23 maps to 0.5.
        scaled = index.hour / 23.0
        return scaled - 0.5
5 DayOfWeek
提取时间戳对应一周内的天数(星期几),并标准化到 [-0.5, 0.5] 范围内
class DayOfWeek(TimeFeature):
    """Day of week encoded as value between [-0.5, 0.5]."""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # ``dayofweek`` runs from 0 (Monday) to 6 (Sunday), so dividing by 6
        # spreads the week over the full [-0.5, 0.5] interval.
        scaled = index.dayofweek / 6.0
        return scaled - 0.5
6 DayOfMonth
提取时间戳对应一个月内的天数,并标准化到 [-0.5, 0.5] 范围内
class DayOfMonth(TimeFeature):
    """Day of month encoded as value between [-0.5, 0.5]."""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Shift the 1-based day to start at 0; day 31 then maps to 0.5.
        days_elapsed = index.day - 1
        return days_elapsed / 30.0 - 0.5
7 DayOfYear
提取时间戳对应一年内的天数,并标准化到 [-0.5, 0.5] 范围内
class DayOfYear(TimeFeature):
    """Day of year encoded as value between [-0.5, 0.5]."""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Shift the 1-based day to start at 0; day 366 then maps to 0.5.
        days_elapsed = index.dayofyear - 1
        return days_elapsed / 365.0 - 0.5
8 MonthOfYear
提取时间戳对应一年内的月份,并标准化到 [-0.5, 0.5] 范围内
class MonthOfYear(TimeFeature):
    """Month of year encoded as value between [-0.5, 0.5]."""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Shift the 1-based month to start at 0; December then maps to 0.5.
        months_elapsed = index.month - 1
        return months_elapsed / 11.0 - 0.5
9 WeekOfYear
提取时间戳对应一年的周数,并标准化到 [-0.5, 0.5] 范围内
class WeekOfYear(TimeFeature):
    """Week of year encoded as value between [-0.5, 0.5]."""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # ``DatetimeIndex.week`` was deprecated in pandas 1.1 and removed in
        # pandas 2.0; ``isocalendar().week`` is its documented replacement
        # and yields the same ISO 8601 week numbers (1..53).
        iso_week = index.isocalendar().week
        # ``isocalendar()`` yields a nullable unsigned-integer column; turn
        # it into a plain float array (missing timestamps become NaN) so the
        # arithmetic below produces an ordinary float result.
        weeks = iso_week.to_numpy(dtype="float64", na_value=np.nan)
        # Week 1 maps to -0.5 and week 53 maps to 0.5.
        return (weeks - 1) / 52.0 - 0.5
10 time_features_from_frequency_str
根据传入的频率字符串 freq_str,返回相应的时间特征列表
def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """Return the time features that match a frequency string.

    Parameters
    ----------
    freq_str
        Frequency string that ``to_offset`` can parse.

    Returns
    -------
    List[TimeFeature]
        Newly constructed feature instances for the first offset type in the
        table below that the parsed offset is an instance of.

    Raises
    ------
    RuntimeError
        If the parsed offset matches none of the offset types in the table.
    """
    # Each finer granularity reuses the features of the coarser one and puts
    # its own feature in front.
    daily = (DayOfWeek, DayOfMonth, DayOfYear)
    hourly = (HourOfDay,) + daily
    minutely = (MinuteOfHour,) + hourly
    secondly = (SecondOfMinute,) + minutely

    # Pairs of (pandas offset type, feature classes), checked in this order.
    candidates = (
        (offsets.YearEnd, ()),
        (offsets.QuarterEnd, (MonthOfYear,)),
        (offsets.MonthEnd, (MonthOfYear,)),
        (offsets.Week, (DayOfMonth, WeekOfYear)),
        (offsets.Day, daily),
        (offsets.BusinessDay, daily),
        (offsets.Hour, hourly),
        (offsets.Minute, minutely),
        (offsets.Second, secondly),
    )

    # Turn the frequency string into a pandas offset object so it can be
    # compared against the offset types above.
    parsed = to_offset(freq_str)

    # The first offset type the parsed offset is an instance of wins.
    selected = next(
        (classes for kind, classes in candidates if isinstance(parsed, kind)),
        None,
    )
    if selected is not None:
        return [feature_cls() for feature_cls in selected]

    # Nothing matched: report the frequency and list the supported ones.
    supported_freq_msg = (
        "\n"
        f"Unsupported frequency {freq_str}\n"
        "The following frequencies are supported:\n"
        "Y - yearly\n"
        "alias: A\n"
        "M - monthly\n"
        "W - weekly\n"
        "D - daily\n"
        "B - business days\n"
        "H - hourly\n"
        "T - minutely\n"
        "alias: min\n"
        "S - secondly\n"
    )
    raise RuntimeError(supported_freq_msg)
11 time_features
从一个包含日期信息的 dates 数据框中提取日期特征,并根据参数 timeenc 和 freq 对这些特征进行编码
def time_features(dates, timeenc=1, freq="h"):
    """Encode the timestamps in ``dates["date"]`` as a feature matrix.

    Parameters
    ----------
    dates
        DataFrame with a ``date`` column holding the timestamps. It is not
        modified.
    timeenc
        ``0`` returns raw calendar fields (month, day, weekday, hour and the
        quarter-hour slot ``minute // 15``), keeping only the fields listed
        for ``freq``. ``1`` returns the features chosen by
        ``time_features_from_frequency_str(freq)``.
    freq
        Frequency string. With ``timeenc=0`` it is lower-cased and must be
        one of ``y``, ``m``, ``w``, ``d``, ``b``, ``h``, ``t``; with
        ``timeenc=1`` it is passed to ``time_features_from_frequency_str``.

    Returns
    -------
    np.ndarray
        Two-dimensional array with one row per timestamp and one column per
        feature.

    Raises
    ------
    KeyError
        If ``timeenc`` is 0 and ``freq`` is not one of the listed keys.
    ValueError
        If ``timeenc`` is neither 0 nor 1.
    """
    if timeenc == 0:
        stamps = pd.to_datetime(dates.date)
        # Collect the fields in a separate frame so the caller's DataFrame is
        # left untouched. The vectorised ``.dt`` accessors replace
        # ``Series.apply(func, 1)``, whose positional ``1`` was bound to the
        # deprecated ``convert_dtype`` parameter.
        fields = pd.DataFrame(
            {
                "month": stamps.dt.month,
                "day": stamps.dt.day,
                "weekday": stamps.dt.weekday,
                "hour": stamps.dt.hour,
                # Minutes are bucketed into 15-minute slots (0..3).
                "minute": stamps.dt.minute // 15,
            }
        )
        # Which fields are kept for each frequency.
        freq_map = {
            "y": [],
            "m": ["month"],
            "w": ["month"],
            "d": ["month", "day", "weekday"],
            "b": ["month", "day", "weekday"],
            "h": ["month", "day", "weekday", "hour"],
            "t": ["month", "day", "weekday", "hour", "minute"],
        }
        return fields[freq_map[freq.lower()]].values

    if timeenc == 1:
        # Converting the raw values yields a ``pd.DatetimeIndex``.
        index = pd.to_datetime(dates.date.values)
        features = time_features_from_frequency_str(freq)
        if not features:
            # ``np.vstack`` rejects an empty list; return an array with zero
            # feature columns instead (the case for yearly frequencies).
            return np.empty((len(index), 0))
        # Each feature gives one row; transpose so that rows are timestamps
        # and columns are features.
        stacked = np.vstack([feature(index) for feature in features])
        return stacked.transpose(1, 0)

    raise ValueError(f"timeenc must be 0 or 1, got {timeenc!r}")
11.1 举例:
假设有一个包含日期信息的数据框 dates,并且我们希望提取与这些日期相关的时间特征:
import pandas as pd
# Build a DataFrame whose single "date" column holds timestamp strings.
data = {
    "date": [
        "2024-05-01 12:00:00",
        "2024-05-02 08:30:00",
        "2024-05-03 14:45:00",
    ]
}
# Parse the strings into pandas datetime values while building the frame.
dates = pd.DataFrame(data).assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
调用 time_features 函数提取时间特征。比如说我们希望提取以小时为粒度的时间特征,并且 timeenc 为 1:
# Hourly features with timeenc=1. The lower-case alias "h" is used because
# the upper-case "H" frequency alias is deprecated in pandas >= 2.2.
time_features(dates, timeenc=1, freq="h")
# Output:
# array([[ 0.02173913, -0.16666667, -0.5       , -0.16849315],
#        [-0.15217391,  0.        , -0.46666667, -0.16575342],
#        [ 0.10869565,  0.16666667, -0.43333333, -0.1630137 ]])
输出将是一个二维数组,其中每一列对应一种时间特征,每一行对应 dates 中的一行:
HourOfDay: 对应于每个日期的小时特征,范围 [-0.5, 0.5]
DayOfWeek: 对应于每个日期的星期几,范围 [-0.5, 0.5]
DayOfMonth: 对应于每个日期的一个月中的第几天,范围 [-0.5, 0.5]
DayOfYear: 对应于每个日期在一年中的第几天,范围 [-0.5, 0.5]
11.2 对 `return np.vstack([feat(dates) for feat in time_features_from_frequency_str(freq)]).transpose(1, 0)` 的解读
# The walkthrough calls each feature on a DatetimeIndex built from the "date"
# column, the same conversion time_features performs when timeenc is 1.
dates1 = pd.to_datetime(dates.date.values)

# Lower-case "h" is used because the upper-case "H" frequency alias is
# deprecated in pandas >= 2.2.
time_features_from_frequency_str("h")
# [HourOfDay(), DayOfWeek(), DayOfMonth(), DayOfYear()]

# Each feature turns the index into one vector of scaled values.
for feat in time_features_from_frequency_str("h"):
    print(feat(dates1))
# Index([0.021739130434782594, -0.15217391304347827, 0.10869565217391308], dtype='float64')
# Index([-0.16666666666666669, 0.0, 0.16666666666666663], dtype='float64')
# Index([-0.5, -0.4666666666666667, -0.43333333333333335], dtype='float64')
# Index([-0.16849315068493148, -0.16575342465753423, -0.16301369863013698], dtype='float64')

# Collected into a list: one entry per feature.
[feat(dates1) for feat in time_features_from_frequency_str("h")]
# [Index([0.021739130434782594, -0.15217391304347827, 0.10869565217391308], dtype='float64'),
#  Index([-0.16666666666666669, 0.0, 0.16666666666666663], dtype='float64'),
#  Index([-0.5, -0.4666666666666667, -0.43333333333333335], dtype='float64'),
#  Index([-0.16849315068493148, -0.16575342465753423, -0.16301369863013698], dtype='float64')]

# Stacked vertically: one row per feature, one column per timestamp. The
# transpose in time_features then swaps the two axes.
np.vstack(
    [feat(dates1) for feat in time_features_from_frequency_str("h")]
)
# array([[ 0.02173913, -0.15217391,  0.10869565],
#        [-0.16666667,  0.        ,  0.16666667],
#        [-0.5       , -0.46666667, -0.43333333],
#        [-0.16849315, -0.16575342, -0.1630137 ]])