• В правой части каждого сообщения есть стрелки «вверх» и «вниз». Не стесняйтесь оценивать ответы. Чтобы закрыть свой тикет, автору вопроса надо выбрать лучший ответ: просто нажмите соответствующий значок в правой части сообщения.

python data training

benvito

New member
04.06.2024
1
0
BIT
14

create venv​

conda create -n [env_name] python=3.10 anaconda

activate venv​

conda activate [env_name]

pip list​

  • seaborn
  • scikit-learn
  • matplotlib
  • pandas
  • numpy
  • flet
  • scipy
  • statsmodels
  • dash
  • bs4
  • requests
  • psycopg2
  • plotly
  • pymorphy2

Визуализация кластеризации​

Python:
# Scatter plot of the start_lon / start_lat points, coloured by K-means label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x="start_lon",
    y="start_lat",
    hue="kmeans_cluster",
    palette="deep",
    ax=ax,
)
ax.grid(True)
plt.show()

Оценка качества кластеризации​

Python:
# Number of rows that received each K-means cluster label.
df['kmeans_cluster'].value_counts()

silhouette_score​

Python:
from sklearn.metrics import silhouette_score

# Silhouette coefficient of the fitted K-means labels on X.
silhouette_score(X, kmeans.labels_)

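Это значение лежит в диапазоне от −1 до +1 и должно быть как можно ближе к +1: чем оно выше, тем плотнее кластеры и тем лучше они отделены друг от друга. Если значение близко к 0, кластеры перекрываются — кластеризация определена нечётко и может быть неоднозначной. Отрицательное значение означает, что многие точки, вероятно, отнесены не к своему кластеру.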


Кластеризация​

K-means​

Python:
from sklearn.cluster import KMeans

# Fit K-means with 4 clusters; the fixed random_state keeps the run repeatable.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans.fit(X)
# Store each row's cluster label next to the source data.
df['kmeans_cluster'] = kmeans.labels_


AgglomerativeClustering​

Python:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into 4 clusters; fit_predict fits the estimator
# and hands back the same labels that end up in `labels_`.
agglomerative_cluster = AgglomerativeClustering(n_clusters=4)
df['agglomerative_cluster'] = agglomerative_cluster.fit_predict(X)


DBSCAN​

Python:
from sklearn.cluster import DBSCAN

# Density-based clustering: neighbourhood radius 0.1, at least 5 samples per
# core point. fit_predict fits the estimator and returns its `labels_`.
dbscan = DBSCAN(eps=0.1, min_samples=5)
df['dbscan_cluster'] = dbscan.fit_predict(X)

convert types​

to numeric​

Python:
# Turn decimal commas into decimal points, then parse the column as numbers.
lat_as_text = df["start_lat"].str.replace(",", ".")
df["start_lat"] = pd.to_numeric(lat_as_text)
df.dtypes


to datetime​

# Parse the start_at column into pandas datetimes.
df['start_at'] = pd.to_datetime(df['start_at'])

prepare​

count nulls (missing values)

Python:
# Print how many missing (null/NaN) values each column contains.
for col in df.columns:
    print(f"{col} : {df[col].isnull().sum()}")


dropna​

# Drop the rows whose "source" value is missing.
df = df.dropna(subset=["source"])

anomalies​

for example: start time > end time

Python:
# Rows that start after they end are inconsistent: report how many, then drop them.
starts_after_end = df["start_at"] > df["end_at"]
print(starts_after_end.sum())
df = df.drop(index=df.loc[starts_after_end].index)

Построение временного ряда​

Python:
# Work on a copy so the derived calendar columns are not written into df itself.
time_series = df.copy()
# Split the start timestamp into hour, weekday, day of month and month.
started = time_series['start_at']
time_series['hour'] = started.dt.hour
time_series['day_of_week'] = started.dt.dayofweek
time_series['day_of_month'] = started.dt.day
time_series['month'] = started.dt.month
# Drop the columns that are not needed for the time-based features.
time_series = time_series.drop(axis=1, labels=["journey_id", "end_state", "user_id", "driver_start_lat", "driver_start_lon", "end_lat", "end_lon", "start_lat", "start_lon", "agglomerative_cluster", "dbscan_cluster", "driver_id", "taxi_id", "icon", "start_type"])

# Count the rows of df that fall into each 1-hour bucket.
# 'h' is the hourly alias; the upper-case 'H' spelling is deprecated since pandas 2.2.
time_series_hours = time_series.resample('h', on="start_at").size()

Собираем X и Y для обучения​

Python:
from sklearn.model_selection import train_test_split

# One feature row per hourly bucket: the calendar features of the bucket's first record.
# 'h' is the hourly alias; the upper-case 'H' spelling is deprecated since pandas 2.2.
X = time_series.resample('h', on="start_at").agg({'hour': 'first', 'day_of_week': 'first', 'day_of_month': 'first', 'month': 'first'})
# Buckets without any record have no features; drop them.
X = X.dropna(axis=0)
# Target: number of records per hour, keeping only the non-empty buckets.
y = time_series_hours[time_series_hours != 0]

# shuffle=False keeps chronological order: the last 20% of the hours form the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

Визуализация предикта​

Python:
# Построение графика
# Draw the test targets, the predictions for the test period and the training targets.
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(y_test.index, y_test, label='Реальные значения', color='blue')
ax.plot(y_test.index, rf_predictions, label='Предсказанные значения', color='red')
ax.plot(y_train.index, y_train, label='Реальные тренировочные значения', color='green')

# Axis labels, title and legend.
ax.set_xlabel('Дата/Время')
ax.set_ylabel('Количество заказов')
ax.set_title('Сравнение реальных и предсказанных значений RF classifier')
ax.legend()

# Show the figure with a grid.
ax.grid(True)
plt.show()

Функция предикта для временного ряда​

Python:
def predict_period(start_date, end_date, model):
    """Predict a rounded integer value for every hour of a period.

    Builds one row per hour between ``start_date`` and ``end_date`` with the
    columns ``hour``, ``day_of_week``, ``day_of_month`` and ``month`` (all
    float64) and passes that frame to ``model.predict``.

    Args:
        start_date: Start of the period, in any form ``pd.date_range`` accepts.
        end_date: End of the period, in any form ``pd.date_range`` accepts.
        model: Object exposing ``predict(DataFrame)``.

    Returns:
        The model's predictions rounded to the nearest integer and converted
        with ``np.int64``, one value per hour.
    """
    # 'h' is the hourly alias; the upper-case 'H' spelling is deprecated since pandas 2.2.
    date_range = pd.date_range(start=start_date, end=end_date, freq='h')

    # Calendar features of every hour, in the order the training frame used.
    data = pd.DataFrame(
        {
            'hour': date_range.hour,
            'day_of_week': date_range.dayofweek,
            'day_of_month': date_range.day,
            'month': date_range.month,
        },
        index=date_range,
    ).astype(np.float64)

    return np.int64(np.round(model.predict(data)))


Python:
import flet as ft
import datetime as dt

class DatePicker(ft.Container):
    """Container with a date label, a time label and buttons that open the pickers."""

    def __init__(self, page : ft.Page, *args, **kwargs):
        """Create the pickers, register them on the page overlay and build the layout."""
        super().__init__(*args, **kwargs)

        def make_button(caption, handler):
            # Both buttons share the same width and background colour.
            return ft.TextButton(
                caption,
                on_click=handler,
                width=200,
                style=ft.ButtonStyle(bgcolor=ft.colors.BLACK12),
            )

        # Changes and dismissals of either picker go to the same handler.
        self.date_picker = ft.DatePicker(
            on_change=self.change_date,
            on_dismiss=self.change_date,
        )
        self.time_picker = ft.TimePicker(
            on_change=self.change_date,
            on_dismiss=self.change_date,
        )
        for picker in (self.time_picker, self.date_picker):
            page.overlay.append(picker)

        self.date_picker_button = make_button("Pick 1 date", self.pick_date)
        self.time_picker_button = make_button("Pick 1 time", self.pick_time)

        # The labels start out showing the current date and time.
        self.date_text = ft.Text(value=dt.datetime.now().strftime("%Y-%m-%d"), size=24)
        self.time_text = ft.Text(value=dt.datetime.now().strftime("%H:%M:%S"), size=24)

        labels_row = ft.Row([self.date_text, self.time_text])
        column = ft.Column(
            [labels_row, self.date_picker_button, self.time_picker_button],
            horizontal_alignment=ft.CrossAxisAlignment.CENTER,
        )
        self.container = ft.Container(column, alignment=ft.alignment.bottom_left)

        self.content = self.container

    def change_date(self, e):
        """Copy the pickers' current values into the labels and redraw the control."""
        picked_date = self.date_picker.value
        if picked_date is not None:
            self.date_text.value = picked_date.strftime("%Y-%m-%d")
        picked_time = self.time_picker.value
        if picked_time is not None:
            self.time_text.value = picked_time.strftime("%H:%M:%S")
        self.update()

    def pick_time(self, e):
        """Open the time picker."""
        self.time_picker.pick_time()

    def pick_date(self, e):
        """Open the date picker."""
        self.date_picker.pick_date()

def main(page: ft.Page):
    """Put two DatePicker widgets and a three-option radio group on the page."""
    pickers = [DatePicker(page), DatePicker(page)]
    picker_row = ft.Row(pickers, alignment=ft.MainAxisAlignment.SPACE_EVENLY)

    def report_choice(e):
        # Print the value of the selected radio button.
        print(e.control.value)

    radio_buttons = [ft.Radio(label=str(n), value=str(n)) for n in range(1, 4)]
    change_cluster = ft.RadioGroup(
        content=ft.Row(radio_buttons, alignment=ft.MainAxisAlignment.CENTER),
        on_change=report_choice,
    )

    page.add(ft.Container(picker_row), change_cluster)


if __name__ == "__main__":
    ft.app(target=main)

Python:
import dash
from dash import html
from dash import dcc
import plotly.express as px
import pandas as pd
import asyncio
import psycopg2
from config import *
import bs4
import requests
from pymorphy2 import MorphAnalyzer

def parse_data() -> list:
    """Scrape (country, power) pairs from the list in the AEM Group article.

    For every ``<li>`` of the first ``<ul>`` inside the ``div.wysiwyg`` block,
    the country is the text between the last "в " and the first " на ",
    reduced to its normal form by pymorphy2. The power is the text between
    "вырабатывается " and " МВт " with the spaces removed.

    Returns:
        A list of ``(country, power)`` tuples; ``power`` is a string.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    url = "https://aem-group.ru/mediacenter/informatoriy/skolko-atomnyix-stanczij-rabotaet-v-mire-i-v-rossii.html"

    # A timeout keeps a hanging server from blocking the caller forever, and
    # an error status is reported as such instead of being parsed as HTML.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = bs4.BeautifulSoup(response.text, "lxml")
    all_info = soup.find("div", class_="wysiwyg")
    list_items = all_info.find("ul").find_all("li")

    # Build the analyzer once; constructing it for every item is wasteful.
    morph = MorphAnalyzer(lang="ru")

    data = []
    for li in list_items:
        text = str(li.text)
        country = text[text.rfind("в ") + 2:text.find(" на ")]
        power = text[text.rfind("вырабатывается ") + 15:text.find(" МВт ")]
        country = morph.parse(country)[0].normal_form
        data.append((country, power.replace(" ", "")))

    return data

def sql_query(sql: str):
    """Run one SQL statement on the configured PostgreSQL database and fetch all rows.

    Connection parameters come from the module-level ``host``, ``user``,
    ``password`` and ``db_name``. No commit is issued.

    Args:
        sql: The statement to execute.

    Returns:
        The list of rows from ``cursor.fetchall()``, or an empty list when
        connecting, executing or fetching failed (the error is printed).
    """
    conn = None
    cursor = None
    # Defined up front so a failure cannot leave the name unbound at `return`.
    result = []
    try:
        conn = psycopg2.connect(
            host=host,
            user=user,
            password=password,
            database=db_name
        )

        cursor = conn.cursor()

        cursor.execute(query=sql)

        result = cursor.fetchall()
    except Exception as e:
        print("ERROR:", e)
    finally:
        # Close only what was actually opened.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
    return result

def update_data() -> pd.DataFrame:
    """Scrape the data, upsert it into the ``energy`` table and return the table.

    The table is created if it does not exist. Every scraped
    ``(country, power)`` pair is inserted, or updated when the country is
    already present, and the whole table is then read back. The result is also
    stored in the module-level ``df``.

    Returns:
        A DataFrame with the columns ``country`` and ``power``. When the
        database step fails (the error is printed), the frame is built from
        the scraped pairs instead, so callers still get both columns.
    """
    global df
    data = parse_data()
    conn = None
    cursor = None
    rows = None
    try:
        conn = psycopg2.connect(
            host=host,
            user=user,
            password=password,
            database=db_name
        )

        cursor = conn.cursor()

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS energy(
                    id serial PRIMARY KEY,
                    country varchar(255) NOT NULL,
                    power bigint NOT NULL,
                    UNIQUE(country)
            )
                    """)

        cursor.executemany(
            "INSERT INTO energy (country, power) VALUES (%s, %s) ON CONFLICT (country) DO UPDATE SET country = EXCLUDED.country, power = EXCLUDED.power;",
            data,
        )

        conn.commit()

        cursor.execute("""
        SELECT * FROM energy
                   """)

        rows = cursor.fetchall()
    except Exception as e:
        print("ERROR:", e)
    finally:
        # Close only what was actually opened.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
            print("Соединение с PostgreSQL закрыто")

    if rows is None:
        # The database step failed: the scraped pairs have no id column and
        # hold the power as text, so build the frame from them directly.
        df = pd.DataFrame(data, columns=["country", "power"])
        df["power"] = pd.to_numeric(df["power"], errors="coerce")
    else:
        df = pd.DataFrame(rows, columns=["id", "country", "power"])
        df = df.drop(labels="id", axis=1)

    return df

# Create the Dash application; the callbacks and the layout below attach to it.
app = dash.Dash(__name__)

@app.callback(
    dash.dependencies.Output('bar-graph', 'figure'),
    dash.dependencies.Output('SQL1', 'children'),

    [
        dash.dependencies.Input('interval', 'n_intervals'),
    ]
)
def update_graph(num):
    """Refresh the bar chart and the top-power caption on every interval tick.

    NOTE(review): another callback in this file is also named
    ``update_graph``. Dash registers both through the decorator, but the
    module-level name ends up bound to the later definition only.

    Args:
        num: The ``n_intervals`` counter of the interval component (unused).

    Returns:
        The bar figure and the caption text. The caption is left untouched
        (``dash.no_update``) when the query returns no rows.
    """
    print("Обновлен график!")
    energy = update_data()
    fig = px.bar(energy, x="country", y="power")
    top_rows = sql_query("SELECT country, power FROM energy ORDER BY power DESC LIMIT 1;")
    if not top_rows:
        # Empty table or failed query: keep the previous caption instead of
        # raising IndexError inside the callback.
        return fig, dash.no_update
    country, power = top_rows[0][0], top_rows[0][1]
    return fig, "Максимальное потребление: " + str(country) + " " + str(power)

@app.callback(
    dash.dependencies.Output('graph1', 'figure'),
    dash.dependencies.Output('graph2', 'figure'),
    [
        dash.dependencies.Input('radio', 'value'),
        dash.dependencies.Input('interval', 'n_intervals'),
    ]
)
def update_graph(radio, num):
    """Pick the two figures that belong to the selected radio option.

    Args:
        radio: Value of the ``radio`` component.
        num: The ``n_intervals`` counter of the interval component (unused).

    Returns:
        Two figures for options "1" and "2". For any other option, two empty
        ``{}`` figures, the same placeholder the layout starts with, rather
        than ``None``.
    """
    print(radio)
    if radio == "1":
        # Reads the module-level df, which update_data() keeps refreshed.
        fig1 = px.bar(df, x="power", y="country", title="ПЕРВАЯ")
        fig2 = px.bar({"y": [1, 2, 3], "x": ["a", "b", "c"]}, x="x", y="y", title="ПЕРВАЯ")
    elif radio == "2":
        fig1 = px.bar({"y2": [10, 20, 30], "x2": ["a", "b", "c"]}, x="x2", y="y2", title="ВТОРАЯ")
        fig2 = px.bar({"y2": [10, 20, 30], "x2": ["a", "b", "c"]}, x="x2", y="y2", title="ВТОРАЯ")
    else:
        # Options without their own charts yet.
        fig1, fig2 = {}, {}

    return fig1, fig2
# Initial load of the module-level DataFrame (scraped data stored in PostgreSQL).
df = update_data()

# Page layout: title, six caption slots, the option selector, three graphs
# and the interval component that fires every 10000 ms.
app.layout = html.Div(children=[
    html.H1(children='Dashboard'),
    *(html.Div(id=f"SQL{n}", style={"font-size": "24px"}) for n in range(1, 7)),
    dcc.RadioItems([str(n) for n in range(1, 7)], "1", id="radio"),
    *(dcc.Graph(id=graph_id, figure={}) for graph_id in ('graph1', 'graph2', 'bar-graph')),
    dcc.Interval(id='interval', interval=10000),
])

if __name__ == '__main__':
    app.run_server(debug=True)  # Start the Dash server in debug mode
 
Последнее редактирование:
Мы в соцсетях:

Обучение наступательной кибербезопасности в игровой форме. Начать игру!