Build a Custom Dataset
Create a custom training dataset with EODHD¶
Learn how to create custom crypto training datasets (features and targets) from scratch. This tutorial demonstrates the full pipeline from data download to ML-ready features and modeling.
What you'll learn:
- Download historical crypto data with numerblox and eodhd
- Engineer cross-sectional features with centimators
- Create ranking targets for prediction
- Validate data quality
- Structure datasets for ML
Why build custom datasets? While CrowdCent provides training data, building your own allows you to:
- Experiment with different features and time windows
- Include additional data sources
- Test hypotheses on historical data
- Develop a unique edge in predictions
Get EODHD API Access¶
Special offer for CrowdCent users: Get 10% off all EODHD plans 👉 https://eodhd.com/pricing-special-10?via=crowdcent
Recommended plan: EOD Historical Data – All World ($17.99/mo), which provides 100k API calls per day, sufficient for most use cases.
!pip install crowdcent-challenge numerblox eod centimators polars altair vegafusion vl-convert-python
import os
import polars as pl
from datetime import datetime
import requests
# Set your EODHD API key.
# The EODHD_API_KEY environment variable takes precedence so the secret does not
# have to be saved inside the notebook; if it is unset (or empty), paste the key
# in place of the placeholder on the next line instead.
EOD_API_KEY = os.environ.get("EODHD_API_KEY") or "YOUR_EODHD_API_KEY_HERE"  # Get from https://eodhd.com/cp/dashboard

# Warn (without aborting) when the placeholder is still in effect.
if EOD_API_KEY == "YOUR_EODHD_API_KEY_HERE":
    print("⚠️ Please set your EODHD API key above")
    print("Get one free at: https://eodhd.com/pricing-special-10?via=crowdcent")
Step 1: Fetch Live Hyperliquid Universe¶
We'll get the current cryptocurrency universe directly from the Hyperliquid API. This includes both active and delisted perpetuals, giving us maximum historical data coverage.
def get_hyperliquid_perpetuals(include_delisted: bool = False, *, timeout: float = 30.0) -> "pl.DataFrame":
    """Fetch the Hyperliquid perpetuals universe as a polars DataFrame.

    Posts a ``{"type": "meta"}`` request to the Hyperliquid info endpoint, reads
    the ``universe`` list from the JSON response and returns one row per
    perpetual with two string columns:

    * ``id`` - the perpetual name exactly as returned by the API.
    * ``eodhd_id`` - the ticker to request from EODHD: the name with a leading
      lowercase "k" removed and "-USD.CC" appended, then replaced by the entry
      in the special-case table below when one exists.

    Args:
        include_delisted: If True, include delisted perpetuals in the output
        timeout: Seconds to wait for the Hyperliquid API before giving up.

    Returns:
        DataFrame with columns "id" and "eodhd_id" (empty if the API returns
        no universe).

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within ``timeout``.
    """
    url = "https://api.hyperliquid.xyz/info"
    payload = {"type": "meta"}

    # Overrides for assets whose EODHD ticker is not simply "<NAME>-USD.CC".
    # Keys are the default-derived ticker, values the ticker EODHD actually uses.
    special_map_dict = {
        "POPCAT-USD.CC": "POPCAT28782-USD.CC",
        "VVV-USD.CC": "VVV.CC",
        "BRETT-USD.CC": "BRETT29743-USD.CC",
        "UNIBOT-USD.CC": "UNIBOT27009-USD.CC",
        "ZRO-USD.CC": "ZRO26997-USD.CC",
        "MOVE-USD.CC": "MOVE32452-USD.CC",
        "STG-USD.CC": "STG18934-USD.CC",
        "GOAT-USD.CC": "GOAT33440-USD.CC",
        "PEPE-USD.CC": "PEPE24478-USD.CC",
        "PROMPT-USD.CC": "PROMPT-USD.CC",
        "NIL-USD.CC": "NIL35702-USD.CC",
        "MNT-USD.CC": "MNT27075-USD.CC",
        "ACE-USD.CC": "ACE28674-USD.CC",
        "HYPE-USD.CC": "HYPE32196-USD.CC",
        "IMX-USD.CC": "IMX10603-USD.CC",
        "INIT-USD.CC": "INIT-USD.CC",
        "PURR-USD.CC": "PURR34332-USD.CC",
        "MOODENG-USD.CC": "MOODENG33093-USD.CC",
        "CHILLGUY-USD.CC": "CHILLGUY-USD.CC",
        "FARTCOIN-USD.CC": "FARTCOIN-USD.CC",
        "GRASS-USD.CC": "GRASS32956-USD.CC",
        "GRIFFAIN-USD.CC": "GRIFFAIN-USD.CC",
        "MELANIA-USD.CC": "MELANIA35347-USD.CC",
        "KAITO-USD.CC": "KAITO-USD.CC",
        "SUI-USD.CC": "SUI20947-USD.CC",
        "BERA-USD.CC": "BERA-USD.CC",
        "MEW-USD.CC": "MEW30126-USD.CC",
        "ANIME-USD.CC": "ANIME35319-USD.CC",
        "NEIRO-USD.CC": "NEIRO32521-USD.CC",
        "DOGS-USD.CC": "DOGS32698-USD.CC",
        "STX-USD.CC": "STX4847-USD.CC",
        "S-USD.CC": "S32684-USD.CC",
        "COMP-USD.CC": "COMP5692-USD.CC",
        "TRUMP-USD.CC": "TRUMP-OFFICIAL-USD.CC",
        "BLAST-USD.CC": "BLAST28480-USD.CC",
        "TAO-USD.CC": "TAO22974-USD.CC",
        "SAGA-USD.CC": "SAGA30372-USD.CC",
        "TON-USD.CC": "TON11419-USD.CC",
        "BIO-USD.CC": "BIO.CC",
        "GMX-USD.CC": "GMX11857-USD.CC",
        "NTRN-USD.CC": "NTRN26680-USD.CC",
        "SUPER-USD.CC": "SUPER8290-USD.CC",
        "SCR-USD.CC": "SCR26998-USD.CC",
        "BANANA-USD.CC": "BANANA28066-USD.CC",
        "ME-USD.CC": "ME32197-USD.CC",
        "GMT-USD.CC": "GMT18069-USD.CC",
        "IO-USD.CC": "IO29835-USD.CC",
        "ZK-USD.CC": "ZKSYNC.CC",
        "ALT-USD.CC": "ALT29073-USD.CC",
        "POL-USD.CC": "POL28321-USD.CC",
        "WCT-USD.CC": "WCT33152-USD.CC",
        "XAI-USD.CC": "XAI28933-USD.CC",
        "JUP-USD.CC": "JUP29210-USD.CC",
        "APE-USD.CC": "APE3-USD.CC",
        "SPX-USD.CC": "SPX28081-USD.CC",
        "HYPER-USD.CC": "HYPER36281-USD.CC",
        "IP-USD.CC": "IP-USD.CC",
        "ZORA-USD.CC": "ZORA35931-USD.CC",
        "PEOPLE-USD.CC": "PEOPLE-USD.CC",
        "BABY-USD.CC": "BABY32198-USD.CC",
        "ARB-USD.CC": "ARB11841-USD.CC",
        "UNI-USD.CC": "UNI7083-USD.CC",
        "OMNI-USD.CC": "OMNI30315-USD.CC",
        "SOPH-USD.CC": "SOPHON-USD.CC",
        "NEIROETH-USD.CC": "NEIRO-USD.CC",
        "APT-USD.CC": "APT21794-USD.CC",
        "STRK-USD.CC": "STRK22691-USD.CC",
        "RESOLV-USD.CC": "RESOLV-USD.CC",
        "TST-USD.CC": "TST35647-USD.CC",
        "PUMP-USD.CC": "PUMP29601-USD.CC",
        "WLFI-USD.CC": "WLFI33251-USD.CC",
        "ASTER-USD.CC": "ASTER36341-USD.CC",
        "SKY-USD.CC": "SKY33038-USD.CC",
    }

    # A timeout keeps the notebook from hanging forever if the API stalls.
    response = requests.post(url, json=payload, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    universe = data.get("universe", [])

    # Extract names of perpetuals based on delisted status
    perpetual_ids = [
        perp["name"]
        for perp in universe
        if include_delisted or not perp.get("isDelisted", False)
    ]

    # Create DataFrame with just id column. The explicit string dtype keeps the
    # string expressions below valid even when the universe is empty.
    df = pl.DataFrame({"id": perpetual_ids}, schema={"id": pl.Utf8})
    df = df.with_columns(
        (pl.col("id") + "-USD.CC")
        # Strip only a *leading* lowercase "k" (presumably Hyperliquid's prefix
        # on names such as kPEPE - TODO confirm); the pattern is anchored so a
        # "k" elsewhere in a name is left untouched.
        .str.replace(r"^k", "")
        .replace(special_map_dict)
        .alias("eodhd_id")
    )
    return df
# Pull the full Hyperliquid universe, delisted perpetuals included
perpetuals_df = get_hyperliquid_perpetuals(include_delisted=True)
print(perpetuals_df)

# Tickers to request from EODHD, plus a lookup from each EODHD ticker back to
# the Hyperliquid id it was derived from
eodhd_tickers = perpetuals_df["eodhd_id"].to_list()
id_mapping = dict(zip(eodhd_tickers, perpetuals_df["id"].to_list()))

# Quick summary of what will be downloaded
sample_ids = list(id_mapping.values())[:10]
print(f"\n📊 Dataset will include {len(eodhd_tickers)} cryptocurrencies")
print(f"Sample tickers: {eodhd_tickers[:10]}")
print(f"Sample IDs: {sample_ids}")
shape: (216, 2) ┌───────┬──────────────┐ │ id ┆ eodhd_id │ │ --- ┆ --- │ │ str ┆ str │ ╞═══════╪══════════════╡ │ BTC ┆ BTC-USD.CC │ │ ETH ┆ ETH-USD.CC │ │ ATOM ┆ ATOM-USD.CC │ │ MATIC ┆ MATIC-USD.CC │ │ DYDX ┆ DYDX-USD.CC │ │ … ┆ … │ │ HEMI ┆ HEMI-USD.CC │ │ APEX ┆ APEX-USD.CC │ │ 2Z ┆ 2Z-USD.CC │ │ ZEC ┆ ZEC-USD.CC │ │ MON ┆ MON-USD.CC │ └───────┴──────────────┘ 📊 Dataset will include 216 cryptocurrencies Sample tickers: ['BTC-USD.CC', 'ETH-USD.CC', 'ATOM-USD.CC', 'MATIC-USD.CC', 'DYDX-USD.CC', 'SOL-USD.CC', 'AVAX-USD.CC', 'BNB-USD.CC', 'APE3-USD.CC', 'OP-USD.CC'] Sample IDs: ['BTC', 'ETH', 'ATOM', 'MATIC', 'DYDX', 'SOL', 'AVAX', 'BNB', 'APE', 'OP']
Step 2: Download Historical Data with numerblox¶
from numerblox.download import EODDownloader

# Download window: from the start of 2020 up to the moment this cell runs
start_date = "20200101"
end_date = datetime.now()

# Configure the EOD downloader for every ticker in the universe
eod = EODDownloader(directory_path="data", key=EOD_API_KEY, tickers=eodhd_tickers)
eod.end_date = end_date

print("Downloading historical data...")
print("This may take a few minutes depending on number of tickers")

# Fetch the data
eod.download_training_data(start=start_date)

# Read the downloaded parquet file back in; the file name embeds the start
# date and the end date formatted as YYYYMMDD
filename = f"data/eod_{start_date}_{end_date:%Y%m%d}.parquet"
eod_df = pl.read_parquet(filename).with_columns(
    # Parse the date strings into datetimes
    pl.col("date").str.to_datetime(),
    # Add a clean "id" column using the Hyperliquid naming
    pl.col("ticker").replace(id_mapping).alias("id"),
)

# Compare how many tickers were requested with how many actually came back
requested_tickers = len(eodhd_tickers)
downloaded_tickers = eod_df["ticker"].n_unique()
coverage_pct = (downloaded_tickers / requested_tickers) * 100

print(f"✅ Downloaded {eod_df.height} rows for {downloaded_tickers} tickers")
print(f"📊 Coverage: {downloaded_tickers}/{requested_tickers} tickers ({coverage_pct:.1f}%)")
print(f"📅 Date range: {eod_df['date'].min()} to {eod_df['date'].max()}")
eod_df.head()
Downloading historical data... This may take a few minutes depending on number of tickers
EOD price data extraction: 0%| | 0/216 [00:00<?, ?it/s]
WARNING: Date pull failed on ticker: 'HPOS-USD.CC'. Exception: 404 Client Error: Not Found for url: https://eodhistoricaldata.com/api/eod/HPOS-USD.CC?period=d&to=2025-10-08+18%3A02%3A25.499812&fmt=json&api_token=YOUR_API_KEY&from=20200101 WARNING: Date pull failed on ticker: 'FRIEND-USD.CC'. Exception: 404 Client Error: Not Found for url: https://eodhistoricaldata.com/api/eod/FRIEND-USD.CC?period=d&to=2025-10-08+18%3A02%3A25.499812&fmt=json&api_token=YOUR_API_KEY&from=20200101 WARNING: Date pull failed on ticker: 'OX-USD.CC'. Exception: 404 Client Error: Not Found for url: https://eodhistoricaldata.com/api/eod/OX-USD.CC?period=d&to=2025-10-08+18%3A02%3A25.499812&fmt=json&api_token=YOUR_API_KEY&from=20200101 WARNING: Date pull failed on ticker: 'SHIA-USD.CC'. Exception: 404 Client Error: Not Found for url: https://eodhistoricaldata.com/api/eod/SHIA-USD.CC?period=d&to=2025-10-08+18%3A02%3A25.499812&fmt=json&api_token=YOUR_API_KEY&from=20200101 WARNING: Date pull failed on ticker: 'CANTO-USD.CC'. Exception: 404 Client Error: Not Found for url: https://eodhistoricaldata.com/api/eod/CANTO-USD.CC?period=d&to=2025-10-08+18%3A02%3A25.499812&fmt=json&api_token=YOUR_API_KEY&from=20200101 WARNING: Date pull failed on ticker: 'NFTI-USD.CC'. Exception: 404 Client Error: Not Found for url: https://eodhistoricaldata.com/api/eod/NFTI-USD.CC?period=d&to=2025-10-08+18%3A02%3A25.499812&fmt=json&api_token=YOUR_API_KEY&from=20200101 WARNING: Date pull failed on ticker: 'ALT29073-USD.CC'. Exception: 404 Client Error: Not Found for url: https://eodhistoricaldata.com/api/eod/PANDORA-USD.CC?period=d&to=2025-10-08+18%3A02%3A25.499812&fmt=json&api_token=YOUR_API_KEY&from=20200101WARNING: Date pull failed on ticker: 'PANDORA-USD.CC'. Exception: 404 Client Error: Not Found for url: https://eodhistoricaldata.com/api/eod/PANDORA-USD.CC?period=d&to=2025-10-08+18%3A02%3A25.499812&fmt=json&api_token=YOUR_API_KEY&from=20200101 WARNING: Date pull failed on ticker: 'AI-USD.CC'. 
Exception: 404 Client Error: Not Found for url: https://eodhistoricaldata.com/api/eod/AI-USD.CC?period=d&to=2025-10-08+18%3A02%3A25.499812&fmt=json&api_token=YOUR_API_KEY&from=20200101 WARNING: Date pull failed on ticker: 'TST35647-USD.CC'. Exception: 404 Client Error: Not Found for url: https://eodhistoricaldata.com/api/eod/JELLY-USD.CC?period=d&to=2025-10-08+18%3A02%3A25.499812&fmt=json&api_token=YOUR_API_KEY&from=20200101 WARNING: Date pull failed on ticker: 'APEX-USD.CC'. Exception: "None of ['date'] are in the columns" WARNING: Date pull failed on ticker: 'HEMI-USD.CC'. Exception: 404 Client Error: Not Found for url: https://eodhistoricaldata.com/api/eod/HEMI-USD.CC?period=d&to=2025-10-08+18%3A02%3A25.499812&fmt=json&api_token=YOUR_API_KEY&from=20200101 WARNING: Date pull failed on ticker: '2Z-USD.CC'. Exception: 404 Client Error: Not Found for url: https://eodhistoricaldata.com/api/eod/2Z-USD.CC?period=d&to=2025-10-08+18%3A02%3A25.499812&fmt=json&api_token=YOUR_API_KEY&from=20200101 ✅ Downloaded 219879 rows for 203 tickers 📊 Coverage: 203/216 tickers (94.0%) 📅 Date range: 2020-01-01 00:00:00 to 2025-10-08 00:00:00
| open | high | low | close | adjusted_close | volume | ticker | date | id |
|---|---|---|---|---|---|---|---|---|
| f64 | f64 | f64 | f64 | f64 | i64 | str | datetime[μs] | str |
| 1.796991 | 1.796991 | 1.104477 | 1.331082 | 1.331082 | 2314949442 | "ARB11841-USD.CC" | 2023-03-23 00:00:00 | "ARB" |
| 1.325396 | 1.555872 | 1.18606 | 1.272492 | 1.272492 | 2537709581 | "ARB11841-USD.CC" | 2023-03-24 00:00:00 | "ARB" |
| 1.272393 | 1.307232 | 1.19297 | 1.224705 | 1.224705 | 1294894243 | "ARB11841-USD.CC" | 2023-03-25 00:00:00 | "ARB" |
| 1.224117 | 1.341853 | 1.208092 | 1.283315 | 1.283315 | 1059587959 | "ARB11841-USD.CC" | 2023-03-26 00:00:00 | "ARB" |
| 1.282521 | 1.320275 | 1.124495 | 1.162705 | 1.162705 | 1014240603 | "ARB11841-USD.CC" | 2023-03-27 00:00:00 | "ARB" |
Visualize Raw Price Data¶
Let's look at what the raw downloaded data looks like - this is the "Input" to our feature engineering pipeline.
import altair as alt

# Route chart data through the VegaFusion transformer
# (NOTE(review): presumably needed because the frame is far larger than
# Altair's default row limit - confirm)
alt.data_transformers.enable("vegafusion")

# Select top tickers by data coverage for visualization
top_tickers = (
    eod_df.group_by("id")
    .agg(pl.col("date").count().alias("count"))
    # Tie-break on id: tickers can share the same row count and group_by does
    # not guarantee group order, so sorting on count alone could select a
    # different set of 20 from one run to the next.
    .sort(["count", "id"], descending=[True, False])
    .head(20)["id"]
    .to_list()
)

# Create visualization of raw prices (Altair is fed a pandas frame here)
viz_df = eod_df.filter(pl.col("id").is_in(top_tickers)).to_pandas()
chart = (
    alt.Chart(viz_df)
    .mark_line(opacity=0.6)
    .encode(
        x=alt.X("date:T", title="Date"),
        # Log scale so assets priced orders of magnitude apart share one axis
        y=alt.Y("close:Q", title="Close Price (USD)", scale=alt.Scale(type="log")),
        color=alt.Color("id:N", title="Ticker", legend=alt.Legend(columns=2)),
        tooltip=["id:N", "date:T", "close:Q"],
    )
    .properties(
        width=700,
        height=300,
        # The data are cryptocurrency prices, not stock prices
        title="Input: Raw Crypto Prices Over Time (Log Scale)",
    )
)
chart