create venv
conda create -n [env_name] python=3.10 anaconda
activate venv
conda activate [env_name]
pip list
- seaborn
- scikit-learn
- matplotlib
- pandas
- numpy
- flet
- scipy
- statsmodels
- dash
- bs4
- requests
- psycopg2
- plotly
- pymorphy2
Визуализация кластеризации
Python:
# Scatter the start_lon / start_lat columns of df, one colour per value of
# the kmeans_cluster column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x="start_lon",
    y="start_lat",
    hue="kmeans_cluster",
    palette="deep",
    data=df,
    ax=ax,
)
ax.grid(True)
plt.show()
Оценка качества кластеризации
Python:
# Number of rows that received each K-means cluster label.
df['kmeans_cluster'].value_counts()
silhouette_score
Python:
from sklearn.metrics import silhouette_score
# Silhouette score of the K-means labelling of the feature matrix X.
silhouette_score(X, kmeans.labels_)
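Значение silhouette_score лежит в диапазоне от -1 до +1 и должно быть как можно ближе к +1: тогда кластеры плотные и хорошо отделены друг от друга. Если оно близко к 0, кластеры перекрываются - кластеризация определена нечётко и может быть неоднозначной. Отрицательное значение означает, что многие объекты, вероятно, отнесены не к тем кластерам.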
Кластеризация
K-means
Python:
# Import added so this snippet is self-contained like the other clustering
# snippets in these notes.
from sklearn.cluster import KMeans

# Fit 4 clusters on X; k-means++ seeding with a fixed random_state keeps the
# result reproducible between runs.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans.fit(X)
# Store the label of every sample next to the data.
# NOTE(review): assumes the rows of X are in the same order as the rows of df.
df['kmeans_cluster'] = kmeans.labels_
AgglomerativeClustering
Python:
from sklearn.cluster import AgglomerativeClustering

# Hierarchical clustering of X into 4 clusters; fit_predict fits the model
# and hands back the same labels that end up in .labels_.
agglomerative_cluster = AgglomerativeClustering(n_clusters=4)
df['agglomerative_cluster'] = agglomerative_cluster.fit_predict(X)
DBSCAN
Python:
from sklearn.cluster import DBSCAN

# Density-based clustering of X: neighbourhood radius 0.1, at least 5 samples
# per dense region. fit_predict fits the model and returns its .labels_.
dbscan = DBSCAN(eps=0.1, min_samples=5)
df['dbscan_cluster'] = dbscan.fit_predict(X)
convert types
to numeric
Python:
# Swap the decimal comma for a dot (presumably the raw values look like
# "55,75"), then parse the strings as numbers.
lat_text = df["start_lat"].str.replace(",", ".")
df["start_lat"] = pd.to_numeric(lat_text)
# Show the resulting column types to confirm the conversion.
df.dtypes
to datetime
# Parse start_at into pandas datetimes; the .dt accessors and resampling used
# on this column elsewhere in these notes need a datetime dtype.
# NOTE(review): end_at is compared with start_at in the anomaly check -
# confirm it is converted the same way.
df['start_at'] = pd.to_datetime(df['start_at'])
prepare
count missing values (nulls)
Python:
# Print how many missing (null) values every column of df contains.
missing_per_column = df.isnull().sum()
for col, missing in missing_per_column.items():
    print(f"{col} : {missing}")
dropna
# Discard the rows whose "source" value is missing.
df = df.dropna(subset=["source"])
anomalies
for example: start time > end time
Python:
# Rows that start after they end are treated as anomalies.
starts_after_end = df["start_at"] > df["end_at"]
# Report how many such rows exist, then remove them by index label.
print(starts_after_end.sum())
anomalous_index = df[starts_after_end].index
df = df.drop(anomalous_index)
Построение временного ряда
Python:
# Work on a copy so the helper columns added below do not leak into df.
time_series = df.copy()
# Split the start timestamp into calendar features.
time_series['hour'] = time_series['start_at'].dt.hour
time_series['day_of_week'] = time_series['start_at'].dt.dayofweek
time_series['day_of_month'] = time_series['start_at'].dt.day
time_series['month'] = time_series['start_at'].dt.month
# Drop everything that is not time related. errors="ignore" lets the snippet
# run even when some of these columns (e.g. the optional clustering labels)
# were never added to df.
time_series = time_series.drop(
    columns=[
        "journey_id", "end_state", "user_id",
        "driver_start_lat", "driver_start_lon",
        "end_lat", "end_lon", "start_lat", "start_lon",
        "agglomerative_cluster", "dbscan_cluster",
        "driver_id", "taxi_id", "icon", "start_type",
    ],
    errors="ignore",
)
# Number of rows in every 1-hour bucket of start_at ('h' is the current
# spelling of the hourly alias; 'H' is deprecated in recent pandas).
time_series_hours = time_series.resample('h', on="start_at").size()
Собираем X и Y для обучения
Python:
# Import added so the snippet is self-contained.
from sklearn.model_selection import train_test_split

# One feature row per hour: the calendar features of the first record that
# falls into that hour ('h' replaces the deprecated 'H' alias).
X = time_series.resample('h', on="start_at").agg(
    {'hour': 'first', 'day_of_week': 'first', 'day_of_month': 'first', 'month': 'first'}
)
# Hours without any record produce NaN feature rows; drop them.
X = X.dropna(axis=0)
# Drop the same empty hours from the target so that X and y stay aligned.
y = time_series_hours[time_series_hours != 0]
# shuffle=False keeps the chronological order: the last 20% of the rows
# become the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
Визуализация предикта
Python:
# Draw the test targets, the model predictions for the test period and the
# training targets on one time axis.
plt.figure(figsize=(15, 6))
# (x values, y values, legend label, colour), listed in drawing order.
curves = (
    (y_test.index, y_test, 'Реальные значения', 'blue'),
    (y_test.index, rf_predictions, 'Предсказанные значения', 'red'),
    (y_train.index, y_train, 'Реальные тренировочные значения', 'green'),
)
for xs, ys, curve_label, curve_color in curves:
    plt.plot(xs, ys, label=curve_label, color=curve_color)
# Axis captions, title and legend.
plt.xlabel('Дата/Время')
plt.ylabel('Количество заказов')
plt.title('Сравнение реальных и предсказанных значений RF classifier')
plt.legend()
# Show the figure with a background grid.
plt.grid(True)
plt.show()
Функция предикта для временного ряда
Python:
def predict_period(start_date, end_date, model):
    """Predict one integer value for every hour of a period.

    Builds one feature row per hour from start_date to end_date (both ends
    included, as pd.date_range does by default) with the float64 columns
    hour, day_of_week, day_of_month and month, and passes them to model.

    Args:
        start_date: First timestamp of the period; anything pd.date_range
            accepts (string, datetime, Timestamp).
        end_date: Last timestamp of the period.
        model: Object exposing predict(features) that returns one number
            per feature row.

    Returns:
        numpy.ndarray of int64 with the predictions rounded by np.round,
        one element per hour. Empty when the period contains no hours; the
        model is not called in that case.
    """
    # 'h' is the current spelling of the hourly alias ('H' is deprecated).
    date_range = pd.date_range(start=start_date, end=end_date, freq='h')
    if len(date_range) == 0:
        return np.empty(0, dtype=np.int64)
    # One row per hour, indexed by the timestamp it describes.
    data = pd.DataFrame(index=date_range)
    data['hour'] = data.index.hour
    data['day_of_week'] = data.index.dayofweek
    data['day_of_month'] = data.index.day
    data['month'] = data.index.month
    # Cast every feature column to float64 in a single step.
    data = data.astype(np.float64)
    predictions = np.asarray(model.predict(data))
    return np.round(predictions).astype(np.int64)
Python:
import flet as ft
import datetime as dt
class DatePicker(ft.Container):
    """Container that shows a date and a time with buttons to change them.

    The ft.DatePicker and ft.TimePicker dialogs are registered in
    page.overlay. The two text labels start at the current date and time and
    are refreshed whenever either dialog reports a change or is dismissed.
    """

    def __init__(self, page: ft.Page, *args, **kwargs):
        """Create the dialogs, buttons and labels and attach them to page.

        Args:
            page: Page whose overlay receives the picker dialogs.
            *args: Forwarded to ft.Container.
            **kwargs: Forwarded to ft.Container.
        """
        super().__init__(*args, **kwargs)

        def make_button(caption, handler):
            # Both buttons share the same size and background colour.
            return ft.TextButton(
                caption,
                on_click=handler,
                width=200,
                style=ft.ButtonStyle(bgcolor=ft.colors.BLACK12),
            )

        # Every picker event (change or dismiss) goes through change_date.
        self.date_picker = ft.DatePicker(
            on_change=self.change_date,
            on_dismiss=self.change_date,
        )
        self.time_picker = ft.TimePicker(
            on_change=self.change_date,
            on_dismiss=self.change_date,
        )
        page.overlay.extend([self.time_picker, self.date_picker])

        self.date_picker_button = make_button("Pick 1 date", self.pick_date)
        self.time_picker_button = make_button("Pick 1 time", self.pick_time)

        # Labels start with the current date and time.
        self.date_text = ft.Text(value=dt.datetime.now().strftime("%Y-%m-%d"), size=24)
        self.time_text = ft.Text(value=dt.datetime.now().strftime("%H:%M:%S"), size=24)

        labels_row = ft.Row([self.date_text, self.time_text])
        column = ft.Column(
            [labels_row, self.date_picker_button, self.time_picker_button],
            horizontal_alignment=ft.CrossAxisAlignment.CENTER,
        )
        self.container = ft.Container(column, alignment=ft.alignment.bottom_left)
        self.content = self.container

    def change_date(self, e):
        """Copy the currently picked date and time into the labels and redraw."""
        picked_date = self.date_picker.value
        picked_time = self.time_picker.value
        if picked_date is not None:
            self.date_text.value = picked_date.strftime("%Y-%m-%d")
        if picked_time is not None:
            self.time_text.value = picked_time.strftime("%H:%M:%S")
        self.update()

    def pick_time(self, e):
        """Open the time picker dialog."""
        self.time_picker.pick_time()

    def pick_date(self, e):
        """Open the date picker dialog."""
        self.date_picker.pick_date()
def main(page: ft.Page):
    """Fill the page with two DatePicker widgets and a cluster radio group."""
    pickers = [DatePicker(page) for _ in range(2)]
    picker_row = ft.Row(pickers, alignment=ft.MainAxisAlignment.SPACE_EVENLY)

    def report_choice(e):
        # Print the value of the radio button that was just selected.
        print(e.control.value)

    radio_row = ft.Row(
        [ft.Radio(label=number, value=number) for number in ("1", "2", "3")],
        alignment=ft.MainAxisAlignment.CENTER,
    )
    change_cluster = ft.RadioGroup(content=radio_row, on_change=report_choice)

    page.add(ft.Container(picker_row), change_cluster)


if __name__ == "__main__":
    ft.app(target=main)
Python:
import dash
from dash import html
from dash import dcc
import plotly.express as px
import pandas as pd
import asyncio
import psycopg2
from config import *
import bs4
import requests
from pymorphy2 import MorphAnalyzer
def parse_data(timeout: float = 10.0) -> list:
    """Scrape (country, power) pairs from the AEM Group article.

    Downloads the page and takes every <li> of the first <ul> inside the
    "wysiwyg" div. Two fragments are cut out of each item's text:
    the words between the last "в " and the first " на " (the country,
    reduced to its normal form with pymorphy2), and the text between
    "вырабатывается " and " МВт " (the power, with spaces removed).

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        List of (country, power) tuples of strings. Empty when the page does
        not contain the expected container or list.

    Raises:
        requests.RequestException: On a network failure, a timeout or a
            non-success HTTP status.
    """
    url = "https://aem-group.ru/mediacenter/informatoriy/skolko-atomnyix-stanczij-rabotaet-v-mire-i-v-rossii.html"
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of trying to parse it.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "lxml")

    container = soup.find("div", class_="wysiwyg")
    item_list = container.find("ul") if container is not None else None
    if item_list is None:
        return []

    # Build the analyzer once instead of once per list item.
    morph = MorphAnalyzer(lang="ru")
    power_marker = "вырабатывается "
    data = []
    for li in item_list.find_all("li"):
        text = li.text
        country = text[text.rfind("в ") + 2:text.find(" на ")]
        power = text[text.rfind(power_marker) + len(power_marker):text.find(" МВт ")]
        country = morph.parse(country)[0].normal_form
        data.append((country, power.replace(" ", "")))
    return data
def sql_query(sql: str, params=None):
    """Run one statement on the configured PostgreSQL database and fetch all rows.

    Args:
        sql: Statement to execute; it has to produce a result set.
        params: Optional sequence or mapping with the values for the
            placeholders in sql. Pass values here instead of formatting
            them into the statement text.

    Returns:
        List of row tuples. An empty list when connecting or executing
        failed; the error is printed in that case.
    """
    conn = None
    cursor = None
    # Bound up front so the return below works after a failure as well.
    result = []
    try:
        conn = psycopg2.connect(
            host=host,
            user=user,
            password=password,
            database=db_name
        )
        cursor = conn.cursor()
        cursor.execute(sql, params)
        result = cursor.fetchall()
    except psycopg2.Error as e:
        print("ERROR:", e)
    finally:
        # cursor stays None when the connection or the cursor creation failed.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
    return result
def update_data() -> pd.DataFrame:
    """Re-scrape the data, upsert it into the energy table and return the table.

    The scraped (country, power) pairs are written with an upsert keyed on
    country, then the whole table is read back. The result is also stored in
    the module-level df.

    Returns:
        DataFrame with the columns "country" and "power". It is empty when
        the database could not be reached or a statement failed; the error
        is printed in that case.
    """
    global df
    data = parse_data()
    # Rows read back from the table; stays empty when the database step fails
    # so the DataFrame below always gets the 3-column shape it expects.
    rows = []
    conn = None
    cursor = None
    try:
        conn = psycopg2.connect(
            host=host,
            user=user,
            password=password,
            database=db_name
        )
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS energy(
            id serial PRIMARY KEY,
            country varchar(255) NOT NULL,
            power bigint NOT NULL,
            UNIQUE(country)
            )
        """)
        for line in data:
            cursor.execute("INSERT INTO energy (country, power) VALUES (%s, %s) ON CONFLICT (country) DO UPDATE SET country = EXCLUDED.country, power = EXCLUDED.power;", line)
        conn.commit()
        cursor.execute("""
            SELECT * FROM energy
        """)
        rows = cursor.fetchall()
    except psycopg2.Error as e:
        print("ERROR:", e)
    finally:
        # cursor stays None when the connection or the cursor creation failed.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
            print("Соединение с PostgreSQL закрыто")
    df = pd.DataFrame(rows, columns=["id", "country", "power"])
    df = df.drop(labels="id", axis=1)
    return df
# Create the app
app = dash.Dash(__name__)


@app.callback(
    dash.dependencies.Output('bar-graph', 'figure'),
    dash.dependencies.Output('SQL1', 'children'),
    [
        dash.dependencies.Input('interval', 'n_intervals'),
    ]
)
def update_graph(num):
    """Refresh the bar chart and the SQL1 caption on every interval tick.

    NOTE: a second callback further down reuses the name update_graph. Dash
    registers a callback when the decorator runs, so both stay active.

    Args:
        num: Tick counter of the 'interval' component (not used).

    Returns:
        Tuple (figure, caption). The caption is left unchanged when the
        query returns no rows.
    """
    print("Обновлен график!")
    energy = update_data()
    fig = px.bar(energy, x="country", y="power")
    sql1 = sql_query("SELECT country, power FROM energy ORDER BY power DESC LIMIT 1;")
    # TODO: captions for the SQL2..SQL6 elements are not produced yet.
    if not sql1:
        # Empty table or failed query: keep whatever caption is shown now.
        return fig, dash.no_update
    country, power = sql1[0][0], sql1[0][1]
    return fig, "Максимальное потребление: " + str(country) + " " + str(power)
@app.callback(
    dash.dependencies.Output('graph1', 'figure'),
    dash.dependencies.Output('graph2', 'figure'),
    [
        dash.dependencies.Input('radio', 'value'),
        dash.dependencies.Input('interval', 'n_intervals'),
    ]
)
def update_graph(radio, num):
    """Pick the two figures that belong to the selected radio option.

    Args:
        radio: Value of the 'radio' component.
        num: Tick counter of the 'interval' component (not used).

    Returns:
        Tuple (figure for graph1, figure for graph2). Options without charts
        get empty figures, the same placeholder the layout starts with,
        instead of None.
    """
    print(radio)
    # Empty placeholders for the options that have no charts yet.
    fig1, fig2 = {}, {}
    if radio == "1":
        # Reads the module-level df.
        fig1 = px.bar(df, x="power", y="country", title="ПЕРВАЯ")
        fig2 = px.bar({"y": [1, 2, 3], "x": ["a", "b", "c"]}, x="x", y="y", title="ПЕРВАЯ")
    elif radio == "2":
        fig1 = px.bar({"y2": [10, 20, 30], "x2": ["a", "b", "c"]}, x="x2", y="y2", title="ВТОРАЯ")
        fig2 = px.bar({"y2": [10, 20, 30], "x2": ["a", "b", "c"]}, x="x2", y="y2", title="ВТОРАЯ")
    return fig1, fig2
# Initial load through update_data(), so the module-level df exists before
# any callback reads it.
df = update_data()

# Text elements SQL1..SQL6.
sql_captions = [
    html.Div(id=f"SQL{number}", style={"font-size": "24px"})
    for number in range(1, 7)
]
# Graphs start empty.
graphs = [
    dcc.Graph(id=graph_id, figure={})
    for graph_id in ('graph1', 'graph2', 'bar-graph')
]

app.layout = html.Div(children=[
    html.H1(children='Dashboard'),  # Page title
    *sql_captions,
    dcc.RadioItems(["1", "2", "3", "4", "5", "6"], "1", id="radio"),
    *graphs,
    # Timer component used as a callback input.
    dcc.Interval(id='interval', interval=10000),
])

if __name__ == '__main__':
    app.run_server(debug=True)  # Run the Dash app
Последнее редактирование: