Add real-time data fetching and configuration options
See README and .env.example
This commit is contained in:
		
							
								
								
									
										7
									
								
								.env.example
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								.env.example
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,7 @@
 | 
			
		||||
TOKEN=
 | 
			
		||||
TRAINING_DAYS=
 | 
			
		||||
TIMEFRAME=
 | 
			
		||||
MODEL=
 | 
			
		||||
REGION=
 | 
			
		||||
DATA_PROVIDER=
 | 
			
		||||
CG_API_KEY=
 | 
			
		||||
							
								
								
									
										13
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										13
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@ -9,6 +9,13 @@ inference-data
 | 
			
		||||
worker-data
 | 
			
		||||
 | 
			
		||||
config.json
 | 
			
		||||
env
 | 
			
		||||
env_file
 | 
			
		||||
.env
 | 
			
		||||
/data
 | 
			
		||||
 | 
			
		||||
**/*.venv*
 | 
			
		||||
**/.cache
 | 
			
		||||
**/.env
 | 
			
		||||
**/env_file
 | 
			
		||||
**/.gitkeep*
 | 
			
		||||
**/*.csv
 | 
			
		||||
**/*.pkl
 | 
			
		||||
**/*.zip
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,4 @@
 | 
			
		||||
# Use an official Python runtime as the base image
 | 
			
		||||
FROM amd64/python:3.9-buster as project_env
 | 
			
		||||
FROM python:3.11-slim as project_env
 | 
			
		||||
 | 
			
		||||
# Set the working directory in the container
 | 
			
		||||
WORKDIR /app
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										45
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										45
									
								
								README.md
									
									
									
									
									
								
							@ -1,12 +1,12 @@
 | 
			
		||||
# Basic ETH Price Prediction Node
 | 
			
		||||
# Basic Price Prediction Node
 | 
			
		||||
 | 
			
		||||
This repository provides an example Allora network worker node, designed to offer price predictions for ETH. The primary objective is to demonstrate the use of a basic inference model running within a dedicated container, showcasing its integration with the Allora network infrastructure to contribute valuable inferences.
 | 
			
		||||
This repository provides an example Allora network worker node, designed to offer price predictions. The primary objective is to demonstrate the use of a basic inference model running within a dedicated container, showcasing its integration with the Allora network infrastructure to contribute valuable inferences.
 | 
			
		||||
 | 
			
		||||
## Components
 | 
			
		||||
 | 
			
		||||
- **Worker**: The node that publishes inferences to the Allora chain.
 | 
			
		||||
- **Inference**: A container that conducts inferences, maintains the model state, and responds to internal inference requests via a Flask application. This node operates with a basic linear regression model for price predictions.
 | 
			
		||||
- **Updater**: A cron-like container designed to update the inference node's data by daily fetching the latest market information from Binance, ensuring the model stays current with new market trends.
 | 
			
		||||
- **Updater**: A cron-like container designed to update the inference node's data by daily fetching the latest market information from the data provider, ensuring the model stays current with new market trends.
 | 
			
		||||
 | 
			
		||||
Check the `docker-compose.yml` file for the detailed setup of each component.
 | 
			
		||||
 | 
			
		||||
@ -17,14 +17,45 @@ A complete working example is provided in the `docker-compose.yml` file.
 | 
			
		||||
### Steps to Setup
 | 
			
		||||
 | 
			
		||||
1. **Clone the Repository**
 | 
			
		||||
2. **Copy and Populate Configuration**
 | 
			
		||||
2. **Copy and Populate Model Configuration environment file**
 | 
			
		||||
    
 | 
			
		||||
    Copy the example .env.example file and populate it with your variables:
 | 
			
		||||
    ```sh
 | 
			
		||||
    cp .env.example .env
 | 
			
		||||
    ```
 | 
			
		||||
 | 
			
		||||
    Here are the currently accepted configurations:
 | 
			
		||||
    - TOKEN
 | 
			
		||||
    Must be one in ['ETH','SOL','BTC','BNB','ARB']. 
 | 
			
		||||
    Note: if you are using `Binance` as the data provider, any token could be used.
 | 
			
		||||
    If you are using Coingecko, you should add its `coin_id` in the [token_map here](https://github.com/allora-network/basic-coin-prediction-node/blob/main/updater.py#L107). Find [more info here](https://docs.coingecko.com/reference/simple-price) and the [list here](https://docs.google.com/spreadsheets/d/1wTTuxXt8n9q7C4NDXqQpI3wpKu1_5bGVmP9Xz0XGSyU/edit?usp=sharing).
 | 
			
		||||
    - TRAINING_DAYS
 | 
			
		||||
    Must be an `int` >= 1. 
 | 
			
		||||
    Represents how many days of historical data to use. 
 | 
			
		||||
    - TIMEFRAME
 | 
			
		||||
    This should be in this form: `10m`, `1h`, `1d`, etc.
 | 
			
		||||
    Note: For Coingecko, Data granularity (candle's body) is automatic - [see here](https://docs.coingecko.com/reference/coins-id-ohlc). To avoid downsampling, it is recommanded to use with Coingecko:
 | 
			
		||||
        - TIMEFRAME >= 30m if TRAINING_DAYS <= 2
 | 
			
		||||
        - TIMEFRAME >= 4h if TRAINING_DAYS <= 30
 | 
			
		||||
        - TIMEFRAME >= 4d if TRAINING_DAYS >= 31
 | 
			
		||||
    - MODEL
 | 
			
		||||
    Must be one in ['LinearRegression','SVR','KernelRidge','BayesianRidge']. 
 | 
			
		||||
    You can easily add support for any other models by [adding it here](https://github.com/allora-network/basic-coin-prediction-node/blob/main/model.py#L133).
 | 
			
		||||
    - REGION
 | 
			
		||||
    Must be `EU` or `US` - it is used for the Binance API. 
 | 
			
		||||
    - DATA_PROVIDER
 | 
			
		||||
    Must be `Binance` or `Coingecko`. Feel free to add support for other data providers to personalize your model!
 | 
			
		||||
    - CG_API_KEY
 | 
			
		||||
    This is your `Coingecko` API key, if you've set `DATA_PROVIDER=coingecko`.
 | 
			
		||||
 | 
			
		||||
3. **Copy and Populate Worker Configuration**
 | 
			
		||||
 | 
			
		||||
    Copy the example configuration file and populate it with your variables:
 | 
			
		||||
    ```sh
 | 
			
		||||
    cp config.example.json config.json
 | 
			
		||||
    ```
 | 
			
		||||
 | 
			
		||||
3. **Initialize Worker**
 | 
			
		||||
4. **Initialize Worker**
 | 
			
		||||
    
 | 
			
		||||
    Run the following commands from the project's root directory to initialize the worker:
 | 
			
		||||
    ```sh
 | 
			
		||||
@ -35,11 +66,11 @@ A complete working example is provided in the `docker-compose.yml` file.
 | 
			
		||||
    - Automatically create Allora keys for your worker.
 | 
			
		||||
    - Export the needed variables from the created account to be used by the worker node, bundle them with your provided `config.json`, and pass them to the node as environment variables.
 | 
			
		||||
 | 
			
		||||
4. **Faucet Your Worker Node**
 | 
			
		||||
5. **Faucet Your Worker Node**
 | 
			
		||||
    
 | 
			
		||||
    You can find the offchain worker node's address in `./worker-data/env_file` under `ALLORA_OFFCHAIN_ACCOUNT_ADDRESS`. [Add faucet funds](https://docs.allora.network/devs/get-started/setup-wallet#add-faucet-funds) to your worker's wallet before starting it.
 | 
			
		||||
 | 
			
		||||
5. **Start the Services**
 | 
			
		||||
6. **Start the Services**
 | 
			
		||||
    
 | 
			
		||||
    Run the following command to start the worker node, inference, and updater nodes:
 | 
			
		||||
    ```sh
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										33
									
								
								app.py
									
									
									
									
									
								
							
							
						
						
									
										33
									
								
								app.py
									
									
									
									
									
								
							@ -1,43 +1,28 @@
 | 
			
		||||
import json
 | 
			
		||||
import pickle
 | 
			
		||||
import pandas as pd
 | 
			
		||||
import numpy as np
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
from flask import Flask, jsonify, Response
 | 
			
		||||
from model import download_data, format_data, train_model
 | 
			
		||||
from config import model_file_path
 | 
			
		||||
from flask import Flask, Response
 | 
			
		||||
from model import download_data, format_data, train_model, get_inference
 | 
			
		||||
from config import model_file_path, TOKEN, TIMEFRAME, TRAINING_DAYS, REGION, DATA_PROVIDER
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
app = Flask(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def update_data():
 | 
			
		||||
    """Download price data, format data and train model."""
 | 
			
		||||
    download_data()
 | 
			
		||||
    format_data()
 | 
			
		||||
    train_model()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_eth_inference():
 | 
			
		||||
    """Load model and predict current price."""
 | 
			
		||||
    with open(model_file_path, "rb") as f:
 | 
			
		||||
        loaded_model = pickle.load(f)
 | 
			
		||||
 | 
			
		||||
    now_timestamp = pd.Timestamp(datetime.now()).timestamp()
 | 
			
		||||
    X_new = np.array([now_timestamp]).reshape(-1, 1)
 | 
			
		||||
    current_price_pred = loaded_model.predict(X_new)
 | 
			
		||||
 | 
			
		||||
    return current_price_pred[0][0]
 | 
			
		||||
    files = download_data(TOKEN, TRAINING_DAYS, REGION, DATA_PROVIDER)
 | 
			
		||||
    format_data(files, DATA_PROVIDER)
 | 
			
		||||
    train_model(TIMEFRAME)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@app.route("/inference/<string:token>")
 | 
			
		||||
def generate_inference(token):
 | 
			
		||||
    """Generate inference for given token."""
 | 
			
		||||
    if not token or token != "ETH":
 | 
			
		||||
    if not token or token.upper() != TOKEN:
 | 
			
		||||
        error_msg = "Token is required" if not token else "Token not supported"
 | 
			
		||||
        return Response(json.dumps({"error": error_msg}), status=400, mimetype='application/json')
 | 
			
		||||
 | 
			
		||||
    try:
 | 
			
		||||
        inference = get_eth_inference()
 | 
			
		||||
        inference = get_inference(token.upper(), TIMEFRAME, REGION, DATA_PROVIDER)
 | 
			
		||||
        return Response(str(inference), status=200)
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        return Response(json.dumps({"error": str(e)}), status=500, mimetype='application/json')
 | 
			
		||||
 | 
			
		||||
@ -3,12 +3,12 @@
 | 
			
		||||
        "addressKeyName": "test",
 | 
			
		||||
        "addressRestoreMnemonic": "",
 | 
			
		||||
        "alloraHomeDir": "",
 | 
			
		||||
        "gas": "1000000",
 | 
			
		||||
        "gasAdjustment": 1.0,
 | 
			
		||||
        "nodeRpc": "http://localhost:26657",
 | 
			
		||||
        "gas": "auto",
 | 
			
		||||
        "gasAdjustment": 1.5,
 | 
			
		||||
        "nodeRpc": "https://allora-rpc.testnet.allora.network",
 | 
			
		||||
        "maxRetries": 1,
 | 
			
		||||
        "delay": 1,
 | 
			
		||||
        "submitTx": false
 | 
			
		||||
        "submitTx": true
 | 
			
		||||
    },
 | 
			
		||||
    "worker": [
 | 
			
		||||
        {
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										16
									
								
								config.py
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								config.py
									
									
									
									
									
								
							@ -1,5 +1,21 @@
 | 
			
		||||
import os
 | 
			
		||||
from dotenv import load_dotenv
 | 
			
		||||
 | 
			
		||||
# Load environment variables from .env file
 | 
			
		||||
load_dotenv()
 | 
			
		||||
 | 
			
		||||
app_base_path = os.getenv("APP_BASE_PATH", default=os.getcwd())
 | 
			
		||||
data_base_path = os.path.join(app_base_path, "data")
 | 
			
		||||
model_file_path = os.path.join(data_base_path, "model.pkl")
 | 
			
		||||
 | 
			
		||||
TOKEN = os.getenv("TOKEN").upper()
 | 
			
		||||
TRAINING_DAYS = os.getenv("TRAINING_DAYS")
 | 
			
		||||
TIMEFRAME = os.getenv("TIMEFRAME")
 | 
			
		||||
MODEL = os.getenv("MODEL")
 | 
			
		||||
REGION = os.getenv("REGION").lower()
 | 
			
		||||
if REGION in ["us", "com", "usa"]:
 | 
			
		||||
    REGION = "us"
 | 
			
		||||
else:
 | 
			
		||||
    REGION = "com"
 | 
			
		||||
DATA_PROVIDER = os.getenv("DATA_PROVIDER").lower()
 | 
			
		||||
CG_API_KEY = os.getenv("CG_API_KEY", default=None)
 | 
			
		||||
 | 
			
		||||
@ -1,12 +1,14 @@
 | 
			
		||||
services:
 | 
			
		||||
  inference:
 | 
			
		||||
    container_name: inference-basic-eth-pred
 | 
			
		||||
    env_file:
 | 
			
		||||
      - .env
 | 
			
		||||
    build: .
 | 
			
		||||
    command: python -u /app/app.py
 | 
			
		||||
    ports:
 | 
			
		||||
      - "8000:8000"
 | 
			
		||||
    healthcheck:
 | 
			
		||||
      test: ["CMD", "curl", "-f", "http://localhost:8000/inference/ETH"]
 | 
			
		||||
      test: ["CMD", "curl", "-f", "http://localhost:8000/inference/${TOKEN}"]
 | 
			
		||||
      interval: 10s
 | 
			
		||||
      timeout: 5s
 | 
			
		||||
      retries: 12
 | 
			
		||||
@ -31,7 +33,7 @@ services:
 | 
			
		||||
 | 
			
		||||
  worker:
 | 
			
		||||
    container_name: worker
 | 
			
		||||
    image: alloranetwork/allora-offchain-node:latest
 | 
			
		||||
    image: alloranetwork/allora-offchain-node:v0.3.0
 | 
			
		||||
    volumes:
 | 
			
		||||
      - ./worker-data:/data
 | 
			
		||||
    depends_on:
 | 
			
		||||
 | 
			
		||||
@ -36,7 +36,7 @@ ENV_LOADED=$(grep '^ENV_LOADED=' ./worker-data/env_file | cut -d '=' -f 2)
 | 
			
		||||
if [ "$ENV_LOADED" = "false" ]; then
 | 
			
		||||
    json_content=$(cat ./config.json)
 | 
			
		||||
    stringified_json=$(echo "$json_content" | jq -c .)
 | 
			
		||||
    docker run -it --entrypoint=bash -v $(pwd)/worker-data:/data -v $(pwd)/scripts:/scripts -e NAME="${nodeName}" -e ALLORA_OFFCHAIN_NODE_CONFIG_JSON="${stringified_json}" alloranetwork/allora-chain:latest -c "bash /scripts/init.sh"
 | 
			
		||||
    docker run -it --entrypoint=bash -v $(pwd)/worker-data:/data -v $(pwd)/scripts:/scripts -e NAME="${nodeName}" -e ALLORA_OFFCHAIN_NODE_CONFIG_JSON="${stringified_json}" alloranetwork/allora-chain:v0.4.0 -c "bash /scripts/init.sh"
 | 
			
		||||
    echo "config.json saved to ./worker-data/env_file"
 | 
			
		||||
else
 | 
			
		||||
    echo "config.json is already loaded, skipping the operation. You can set ENV_LOADED variable to false in ./worker-data/env_file to reload the config.json"
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										196
									
								
								model.py
									
									
									
									
									
								
							
							
						
						
									
										196
									
								
								model.py
									
									
									
									
									
								
							@ -1,98 +1,141 @@
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
import pickle
 | 
			
		||||
from zipfile import ZipFile
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
import pandas as pd
 | 
			
		||||
from sklearn.model_selection import train_test_split
 | 
			
		||||
from sklearn.linear_model import LinearRegression
 | 
			
		||||
from updater import download_binance_monthly_data, download_binance_daily_data
 | 
			
		||||
from config import data_base_path, model_file_path
 | 
			
		||||
from sklearn.kernel_ridge import KernelRidge
 | 
			
		||||
from sklearn.linear_model import BayesianRidge, LinearRegression
 | 
			
		||||
from sklearn.svm import SVR
 | 
			
		||||
from updater import download_binance_daily_data, download_binance_current_day_data, download_coingecko_data, download_coingecko_current_day_data
 | 
			
		||||
from config import data_base_path, model_file_path, TOKEN, MODEL, CG_API_KEY
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
binance_data_path = os.path.join(data_base_path, "binance/futures-klines")
 | 
			
		||||
training_price_data_path = os.path.join(data_base_path, "eth_price_data.csv")
 | 
			
		||||
binance_data_path = os.path.join(data_base_path, "binance")
 | 
			
		||||
coingecko_data_path = os.path.join(data_base_path, "coingecko")
 | 
			
		||||
training_price_data_path = os.path.join(data_base_path, "price_data.csv")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def download_data():
 | 
			
		||||
    cm_or_um = "um"
 | 
			
		||||
    symbols = ["ETHUSDT"]
 | 
			
		||||
    intervals = ["1d"]
 | 
			
		||||
    years = ["2020", "2021", "2022", "2023", "2024"]
 | 
			
		||||
    months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
 | 
			
		||||
    download_path = binance_data_path
 | 
			
		||||
    download_binance_monthly_data(
 | 
			
		||||
        cm_or_um, symbols, intervals, years, months, download_path
 | 
			
		||||
    )
 | 
			
		||||
    print(f"Downloaded monthly data to {download_path}.")
 | 
			
		||||
    current_datetime = datetime.now()
 | 
			
		||||
    current_year = current_datetime.year
 | 
			
		||||
    current_month = current_datetime.month
 | 
			
		||||
    download_binance_daily_data(
 | 
			
		||||
        cm_or_um, symbols, intervals, current_year, current_month, download_path
 | 
			
		||||
    )
 | 
			
		||||
    print(f"Downloaded daily data to {download_path}.")
 | 
			
		||||
def download_data_binance(token, training_days, region):
 | 
			
		||||
    files = download_binance_daily_data(f"{token}USDT", training_days, region, binance_data_path)
 | 
			
		||||
    print(f"Downloaded {len(files)} new files")
 | 
			
		||||
    return files
 | 
			
		||||
 | 
			
		||||
def download_data_coingecko(token, training_days):
 | 
			
		||||
    files = download_coingecko_data(token, training_days, coingecko_data_path, CG_API_KEY)
 | 
			
		||||
    print(f"Downloaded {len(files)} new files")
 | 
			
		||||
    return files
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def format_data():
 | 
			
		||||
    files = sorted([x for x in os.listdir(binance_data_path)])
 | 
			
		||||
def download_data(token, training_days, region, data_provider):
 | 
			
		||||
    if data_provider == "coingecko":
 | 
			
		||||
        return download_data_coingecko(token, int(training_days))
 | 
			
		||||
    elif data_provider == "binance":
 | 
			
		||||
        return download_data_binance(token, training_days, region)
 | 
			
		||||
    else:
 | 
			
		||||
        raise ValueError("Unsupported data provider")
 | 
			
		||||
    
 | 
			
		||||
def format_data(files, data_provider):
 | 
			
		||||
    if not files:
 | 
			
		||||
        print("Already up to date")
 | 
			
		||||
        return
 | 
			
		||||
    
 | 
			
		||||
    if data_provider == "binance":
 | 
			
		||||
        files = sorted([x for x in os.listdir(binance_data_path) if x.startswith(f"{TOKEN}USDT")])
 | 
			
		||||
    elif data_provider == "coingecko":
 | 
			
		||||
        files = sorted([x for x in os.listdir(coingecko_data_path) if x.endswith(".json")])
 | 
			
		||||
 | 
			
		||||
    # No files to process
 | 
			
		||||
    if len(files) == 0:
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    price_df = pd.DataFrame()
 | 
			
		||||
    for file in files:
 | 
			
		||||
        zip_file_path = os.path.join(binance_data_path, file)
 | 
			
		||||
    if data_provider == "binance":
 | 
			
		||||
        for file in files:
 | 
			
		||||
            zip_file_path = os.path.join(binance_data_path, file)
 | 
			
		||||
 | 
			
		||||
        if not zip_file_path.endswith(".zip"):
 | 
			
		||||
            continue
 | 
			
		||||
            if not zip_file_path.endswith(".zip"):
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
        myzip = ZipFile(zip_file_path)
 | 
			
		||||
        with myzip.open(myzip.filelist[0]) as f:
 | 
			
		||||
            line = f.readline()
 | 
			
		||||
            header = 0 if line.decode("utf-8").startswith("open_time") else None
 | 
			
		||||
        df = pd.read_csv(myzip.open(myzip.filelist[0]), header=header).iloc[:, :11]
 | 
			
		||||
        df.columns = [
 | 
			
		||||
            "start_time",
 | 
			
		||||
            "open",
 | 
			
		||||
            "high",
 | 
			
		||||
            "low",
 | 
			
		||||
            "close",
 | 
			
		||||
            "volume",
 | 
			
		||||
            "end_time",
 | 
			
		||||
            "volume_usd",
 | 
			
		||||
            "n_trades",
 | 
			
		||||
            "taker_volume",
 | 
			
		||||
            "taker_volume_usd",
 | 
			
		||||
        ]
 | 
			
		||||
        df.index = [pd.Timestamp(x + 1, unit="ms") for x in df["end_time"]]
 | 
			
		||||
        df.index.name = "date"
 | 
			
		||||
        price_df = pd.concat([price_df, df])
 | 
			
		||||
            myzip = ZipFile(zip_file_path)
 | 
			
		||||
            with myzip.open(myzip.filelist[0]) as f:
 | 
			
		||||
                line = f.readline()
 | 
			
		||||
                header = 0 if line.decode("utf-8").startswith("open_time") else None
 | 
			
		||||
            df = pd.read_csv(myzip.open(myzip.filelist[0]), header=header).iloc[:, :11]
 | 
			
		||||
            df.columns = [
 | 
			
		||||
                "start_time",
 | 
			
		||||
                "open",
 | 
			
		||||
                "high",
 | 
			
		||||
                "low",
 | 
			
		||||
                "close",
 | 
			
		||||
                "volume",
 | 
			
		||||
                "end_time",
 | 
			
		||||
                "volume_usd",
 | 
			
		||||
                "n_trades",
 | 
			
		||||
                "taker_volume",
 | 
			
		||||
                "taker_volume_usd",
 | 
			
		||||
            ]
 | 
			
		||||
            df.index = [pd.Timestamp(x + 1, unit="ms").to_datetime64() for x in df["end_time"]]
 | 
			
		||||
            df.index.name = "date"
 | 
			
		||||
            price_df = pd.concat([price_df, df])
 | 
			
		||||
 | 
			
		||||
    price_df.sort_index().to_csv(training_price_data_path)
 | 
			
		||||
            price_df.sort_index().to_csv(training_price_data_path)
 | 
			
		||||
    elif data_provider == "coingecko":
 | 
			
		||||
        for file in files:
 | 
			
		||||
            with open(os.path.join(coingecko_data_path, file), "r") as f:
 | 
			
		||||
                data = json.load(f)
 | 
			
		||||
                df = pd.DataFrame(data)
 | 
			
		||||
                df.columns = [
 | 
			
		||||
                    "timestamp",
 | 
			
		||||
                    "open",
 | 
			
		||||
                    "high",
 | 
			
		||||
                    "low",
 | 
			
		||||
                    "close"
 | 
			
		||||
                ]
 | 
			
		||||
                df["date"] = pd.to_datetime(df["timestamp"], unit="ms")
 | 
			
		||||
                df.drop(columns=["timestamp"], inplace=True)
 | 
			
		||||
                df.set_index("date", inplace=True)
 | 
			
		||||
                price_df = pd.concat([price_df, df])
 | 
			
		||||
 | 
			
		||||
            price_df.sort_index().to_csv(training_price_data_path)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def train_model():
 | 
			
		||||
    # Load the eth price data
 | 
			
		||||
def load_frame(frame, timeframe):
 | 
			
		||||
    print(f"Loading data...")
 | 
			
		||||
    df = frame.loc[:,['open','high','low','close']].dropna()
 | 
			
		||||
    df[['open','high','low','close']] = df[['open','high','low','close']].apply(pd.to_numeric)
 | 
			
		||||
    df['date'] = frame['date'].apply(pd.to_datetime)
 | 
			
		||||
    df.set_index('date', inplace=True)
 | 
			
		||||
    df.sort_index(inplace=True)
 | 
			
		||||
 | 
			
		||||
    return df.resample(f'{timeframe}', label='right', closed='right', origin='end').mean()
 | 
			
		||||
 | 
			
		||||
def train_model(timeframe):
 | 
			
		||||
    # Load the price data
 | 
			
		||||
    price_data = pd.read_csv(training_price_data_path)
 | 
			
		||||
    df = pd.DataFrame()
 | 
			
		||||
    df = load_frame(price_data, timeframe)
 | 
			
		||||
 | 
			
		||||
    # Convert 'date' to a numerical value (timestamp) we can use for regression
 | 
			
		||||
    df["date"] = pd.to_datetime(price_data["date"])
 | 
			
		||||
    df["date"] = df["date"].map(pd.Timestamp.timestamp)
 | 
			
		||||
    print(df.tail())
 | 
			
		||||
 | 
			
		||||
    df["price"] = price_data[["open", "close", "high", "low"]].mean(axis=1)
 | 
			
		||||
    y_train = df['close'].shift(-1).dropna().values
 | 
			
		||||
    X_train = df[:-1]
 | 
			
		||||
 | 
			
		||||
    # Reshape the data to the shape expected by sklearn
 | 
			
		||||
    x = df["date"].values.reshape(-1, 1)
 | 
			
		||||
    y = df["price"].values.reshape(-1, 1)
 | 
			
		||||
    print(f"Training data shape: {X_train.shape}, {y_train.shape}")
 | 
			
		||||
 | 
			
		||||
    # Split the data into training set and test set
 | 
			
		||||
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.2, random_state=0)
 | 
			
		||||
    # Define the model
 | 
			
		||||
    if MODEL == "LinearRegression":
 | 
			
		||||
        model = LinearRegression()
 | 
			
		||||
    elif MODEL == "SVR":
 | 
			
		||||
        model = SVR()
 | 
			
		||||
    elif MODEL == "KernelRidge":
 | 
			
		||||
        model = KernelRidge()
 | 
			
		||||
    elif MODEL == "BayesianRidge":
 | 
			
		||||
        model = BayesianRidge()
 | 
			
		||||
    # Add more models here
 | 
			
		||||
    else:
 | 
			
		||||
        raise ValueError("Unsupported model")
 | 
			
		||||
    
 | 
			
		||||
    # Train the model
 | 
			
		||||
    model = LinearRegression()
 | 
			
		||||
    model.fit(x_train, y_train)
 | 
			
		||||
    model.fit(X_train, y_train)
 | 
			
		||||
 | 
			
		||||
    # create the model's parent directory if it doesn't exist
 | 
			
		||||
    os.makedirs(os.path.dirname(model_file_path), exist_ok=True)
 | 
			
		||||
@ -102,3 +145,22 @@ def train_model():
 | 
			
		||||
        pickle.dump(model, f)
 | 
			
		||||
 | 
			
		||||
    print(f"Trained model saved to {model_file_path}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_inference(token, timeframe, region, data_provider):
 | 
			
		||||
    """Load model and predict current price."""
 | 
			
		||||
    with open(model_file_path, "rb") as f:
 | 
			
		||||
        loaded_model = pickle.load(f)
 | 
			
		||||
 | 
			
		||||
    # Get current price
 | 
			
		||||
    if data_provider == "coingecko":
 | 
			
		||||
        X_new = load_frame(download_coingecko_current_day_data(token, CG_API_KEY), timeframe)
 | 
			
		||||
    else:
 | 
			
		||||
        X_new = load_frame(download_binance_current_day_data(f"{TOKEN}USDT", region), timeframe)
 | 
			
		||||
    
 | 
			
		||||
    print(X_new.tail())
 | 
			
		||||
    print(X_new.shape)
 | 
			
		||||
 | 
			
		||||
    current_price_pred = loaded_model.predict(X_new)
 | 
			
		||||
 | 
			
		||||
    return current_price_pred[0]
 | 
			
		||||
@ -1,7 +1,9 @@
 | 
			
		||||
flask[async]
 | 
			
		||||
gunicorn[gthread]
 | 
			
		||||
numpy==1.26.2
 | 
			
		||||
pandas==2.1.3
 | 
			
		||||
Requests==2.32.0
 | 
			
		||||
scikit_learn==1.3.2
 | 
			
		||||
werkzeug>=3.0.3 # not directly required, pinned by Snyk to avoid a vulnerability
 | 
			
		||||
numpy
 | 
			
		||||
pandas
 | 
			
		||||
Requests
 | 
			
		||||
aiohttp
 | 
			
		||||
multiprocess
 | 
			
		||||
scikit_learn
 | 
			
		||||
python-dotenv
 | 
			
		||||
							
								
								
									
										204
									
								
								updater.py
									
									
									
									
									
								
							
							
						
						
									
										204
									
								
								updater.py
									
									
									
									
									
								
							@ -1,59 +1,175 @@
 | 
			
		||||
import os
 | 
			
		||||
from datetime import date, timedelta
 | 
			
		||||
import pathlib
 | 
			
		||||
import time
 | 
			
		||||
import requests
 | 
			
		||||
from requests.adapters import HTTPAdapter
 | 
			
		||||
from urllib3.util import Retry
 | 
			
		||||
from concurrent.futures import ThreadPoolExecutor
 | 
			
		||||
import pandas as pd
 | 
			
		||||
import json
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Define the retry strategy
 | 
			
		||||
retry_strategy = Retry(
 | 
			
		||||
    total=4,  # Maximum number of retries
 | 
			
		||||
    backoff_factor=2, # Exponential backoff factor (e.g., 2 means 1, 2, 4, 8 seconds, ...)
 | 
			
		||||
    status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
# Create an HTTP adapter with the retry strategy and mount it to session
 | 
			
		||||
adapter = HTTPAdapter(max_retries=retry_strategy)
 | 
			
		||||
 | 
			
		||||
# Create a new session object
 | 
			
		||||
session = requests.Session()
 | 
			
		||||
session.mount('http://', adapter)
 | 
			
		||||
session.mount('https://', adapter)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
files = []
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Function to download the URL, called asynchronously by several child processes
 | 
			
		||||
def download_url(url, download_path):
 | 
			
		||||
    target_file_path = os.path.join(download_path, os.path.basename(url)) 
 | 
			
		||||
    if os.path.exists(target_file_path):
 | 
			
		||||
        # print(f"File already exists: {url}")
 | 
			
		||||
        return
 | 
			
		||||
def download_url(url, download_path, name=None):
 | 
			
		||||
    try:
 | 
			
		||||
        global files
 | 
			
		||||
        if name:
 | 
			
		||||
            file_name = os.path.join(download_path, name)
 | 
			
		||||
        else:
 | 
			
		||||
            file_name = os.path.join(download_path, os.path.basename(url))
 | 
			
		||||
        dir_path = os.path.dirname(file_name)
 | 
			
		||||
        pathlib.Path(dir_path).mkdir(parents=True, exist_ok=True) 
 | 
			
		||||
        if os.path.isfile(file_name):
 | 
			
		||||
            # print(f"{file_name} already exists")
 | 
			
		||||
            return
 | 
			
		||||
        # Make a request using the session object
 | 
			
		||||
        response = session.get(url)
 | 
			
		||||
        if response.status_code == 404:
 | 
			
		||||
            print(f"File does not exist: {url}")
 | 
			
		||||
        elif response.status_code == 200:
 | 
			
		||||
            with open(file_name, 'wb') as f:
 | 
			
		||||
                f.write(response.content)
 | 
			
		||||
            # print(f"Downloaded: {url} to {file_name}")
 | 
			
		||||
            files.append(file_name)
 | 
			
		||||
            return
 | 
			
		||||
        else:
 | 
			
		||||
            print(f"Failed to download {url}")
 | 
			
		||||
            return
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        print(str(e))
 | 
			
		||||
 | 
			
		||||
    response = requests.get(url)
 | 
			
		||||
    if response.status_code == 404:
 | 
			
		||||
        # print(f"File not exist: {url}")
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
# Function to generate a range of dates
 | 
			
		||||
def daterange(start_date, end_date):
 | 
			
		||||
    for n in range(int((end_date - start_date).days)):
 | 
			
		||||
        yield start_date + timedelta(n)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Function to download daily data from Binance
 | 
			
		||||
def download_binance_daily_data(pair, training_days, region, download_path):
 | 
			
		||||
    base_url = f"https://data.binance.vision/data/spot/daily/klines"
 | 
			
		||||
 | 
			
		||||
    end_date = date.today()
 | 
			
		||||
    start_date = end_date - timedelta(days=int(training_days))
 | 
			
		||||
    
 | 
			
		||||
    global files
 | 
			
		||||
    files = []
 | 
			
		||||
 | 
			
		||||
    with ThreadPoolExecutor() as executor:
 | 
			
		||||
        print(f"Downloading data for {pair}")
 | 
			
		||||
        for single_date in daterange(start_date, end_date):
 | 
			
		||||
            url = f"{base_url}/{pair}/1m/{pair}-1m-{single_date}.zip"
 | 
			
		||||
            executor.submit(download_url, url, download_path)
 | 
			
		||||
    
 | 
			
		||||
    return files
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def download_binance_current_day_data(pair, region):
 | 
			
		||||
    limit = 1000
 | 
			
		||||
    base_url = f'https://api.binance.{region}/api/v3/klines?symbol={pair}&interval=1m&limit={limit}'
 | 
			
		||||
 | 
			
		||||
    # Make a request using the session object
 | 
			
		||||
    response = session.get(base_url)
 | 
			
		||||
    response.raise_for_status()
 | 
			
		||||
    resp = str(response.content, 'utf-8').rstrip()
 | 
			
		||||
 | 
			
		||||
    columns = ['start_time','open','high','low','close','volume','end_time','volume_usd','n_trades','taker_volume','taker_volume_usd','ignore']
 | 
			
		||||
    
 | 
			
		||||
    df = pd.DataFrame(json.loads(resp),columns=columns)
 | 
			
		||||
    df['date'] = [pd.to_datetime(x+1,unit='ms') for x in df['end_time']]
 | 
			
		||||
    df['date'] = df['date'].apply(pd.to_datetime)
 | 
			
		||||
    df[["volume", "taker_volume", "open", "high", "low", "close"]] = df[["volume", "taker_volume", "open", "high", "low", "close"]].apply(pd.to_numeric)
 | 
			
		||||
 | 
			
		||||
    return df.sort_index()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_coingecko_coin_id(token):
 | 
			
		||||
    token_map = {
 | 
			
		||||
        'ETH': 'ethereum',
 | 
			
		||||
        'SOL': 'solana',
 | 
			
		||||
        'BTC': 'bitcoin',
 | 
			
		||||
        'BNB': 'binancecoin',
 | 
			
		||||
        'ARB': 'arbitrum',
 | 
			
		||||
        # Add more tokens here
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    token = token.upper()
 | 
			
		||||
    if token in token_map:
 | 
			
		||||
        return token_map[token]
 | 
			
		||||
    else:
 | 
			
		||||
 | 
			
		||||
        # create the entire path if it doesn't exist
 | 
			
		||||
        os.makedirs(os.path.dirname(target_file_path), exist_ok=True)
 | 
			
		||||
 | 
			
		||||
        with open(target_file_path, "wb") as f:
 | 
			
		||||
            f.write(response.content)
 | 
			
		||||
        # print(f"Downloaded: {url} to {target_file_path}")
 | 
			
		||||
        raise ValueError("Unsupported token")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def download_binance_monthly_data(
 | 
			
		||||
    cm_or_um, symbols, intervals, years, months, download_path
 | 
			
		||||
):
 | 
			
		||||
    # Verify if CM_OR_UM is correct, if not, exit
 | 
			
		||||
    if cm_or_um not in ["cm", "um"]:
 | 
			
		||||
        print("CM_OR_UM can be only cm or um")
 | 
			
		||||
        return
 | 
			
		||||
    base_url = f"https://data.binance.vision/data/futures/{cm_or_um}/monthly/klines"
 | 
			
		||||
def download_coingecko_data(token, training_days, download_path, CG_API_KEY):
 | 
			
		||||
    if training_days <= 7:
 | 
			
		||||
        days = 7
 | 
			
		||||
    elif training_days <= 14:
 | 
			
		||||
        days = 14
 | 
			
		||||
    elif training_days <= 30:
 | 
			
		||||
        days = 30
 | 
			
		||||
    elif training_days <= 90:
 | 
			
		||||
        days = 90
 | 
			
		||||
    elif training_days <= 180:
 | 
			
		||||
        days = 180
 | 
			
		||||
    elif training_days <= 365:
 | 
			
		||||
        days = 365
 | 
			
		||||
    else:
 | 
			
		||||
        days = "max"
 | 
			
		||||
    print(f"Days: {days}")
 | 
			
		||||
 | 
			
		||||
    # Main loop to iterate over all the arrays and launch child processes
 | 
			
		||||
    with ThreadPoolExecutor() as executor:
 | 
			
		||||
        for symbol in symbols:
 | 
			
		||||
            for interval in intervals:
 | 
			
		||||
                for year in years:
 | 
			
		||||
                    for month in months:
 | 
			
		||||
                        url = f"{base_url}/{symbol}/{interval}/{symbol}-{interval}-{year}-{month}.zip"
 | 
			
		||||
                        executor.submit(download_url, url, download_path)
 | 
			
		||||
    coin_id = get_coingecko_coin_id(token)
 | 
			
		||||
    print(f"Coin ID: {coin_id}")
 | 
			
		||||
 | 
			
		||||
    # Get OHLC data from Coingecko
 | 
			
		||||
    url = f'https://api.coingecko.com/api/v3/coins/{coin_id}/ohlc?vs_currency=usd&days={days}&api_key={CG_API_KEY}'
 | 
			
		||||
 | 
			
		||||
def download_binance_daily_data(
 | 
			
		||||
    cm_or_um, symbols, intervals, year, month, download_path
 | 
			
		||||
):
 | 
			
		||||
    if cm_or_um not in ["cm", "um"]:
 | 
			
		||||
        print("CM_OR_UM can be only cm or um")
 | 
			
		||||
        return
 | 
			
		||||
    base_url = f"https://data.binance.vision/data/futures/{cm_or_um}/daily/klines"
 | 
			
		||||
    global files
 | 
			
		||||
    files = []
 | 
			
		||||
 | 
			
		||||
    with ThreadPoolExecutor() as executor:
 | 
			
		||||
        for symbol in symbols:
 | 
			
		||||
            for interval in intervals:
 | 
			
		||||
                for day in range(1, 32):  # Assuming days range from 1 to 31
 | 
			
		||||
                    url = f"{base_url}/{symbol}/{interval}/{symbol}-{interval}-{year}-{month:02d}-{day:02d}.zip"
 | 
			
		||||
                    executor.submit(download_url, url, download_path)
 | 
			
		||||
        print(f"Downloading data for {coin_id}")
 | 
			
		||||
        name = os.path.basename(url).split("?")[0].replace("/", "_") + ".json"
 | 
			
		||||
        executor.submit(download_url, url, download_path, name)
 | 
			
		||||
    
 | 
			
		||||
    return files
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def download_coingecko_current_day_data(token, CG_API_KEY):
 | 
			
		||||
    coin_id = get_coingecko_coin_id(token)
 | 
			
		||||
    print(f"Coin ID: {coin_id}")
 | 
			
		||||
 | 
			
		||||
    url = f'https://api.coingecko.com/api/v3/coins/{coin_id}/ohlc?vs_currency=usd&days=1&api_key={CG_API_KEY}'
 | 
			
		||||
 | 
			
		||||
    # Make a request using the session object
 | 
			
		||||
    response = session.get(url)
 | 
			
		||||
    response.raise_for_status()
 | 
			
		||||
    resp = str(response.content, 'utf-8').rstrip()
 | 
			
		||||
 | 
			
		||||
    columns = ['timestamp','open','high','low','close']
 | 
			
		||||
    
 | 
			
		||||
    df = pd.DataFrame(json.loads(resp), columns=columns)
 | 
			
		||||
    df['date'] = [pd.to_datetime(x,unit='ms') for x in df['timestamp']]
 | 
			
		||||
    df['date'] = df['date'].apply(pd.to_datetime)
 | 
			
		||||
    df[["open", "high", "low", "close"]] = df[["open", "high", "low", "close"]].apply(pd.to_numeric)
 | 
			
		||||
 | 
			
		||||
    return df.sort_index()
 | 
			
		||||
 | 
			
		||||
		Reference in New Issue
	
	Block a user