diff --git a/search_index.pickle b/search_index.pickle new file mode 100644 index 0000000..26e0883 Binary files /dev/null and b/search_index.pickle differ diff --git a/searchindexer.ipynb b/searchindexer.ipynb new file mode 100644 index 0000000..30d4995 --- /dev/null +++ b/searchindexer.ipynb @@ -0,0 +1,276 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.9.0 64-bit", + "metadata": { + "interpreter": { + "hash": "36cf16204b8548560b1c020c4e8fb5b57f0e4c58016f52f2d4be01e192833930" + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: tqdm in /home/anson/.local/lib/python3.8/site-packages (4.59.0)\n" + ] + } + ], + "source": [ + "!pip install tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "import requests as r\n", + "import pandas as pd\n", + "from fuzzywuzzy import fuzz\n", + "from functools import cache\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + " def stocks():\n", + "\n", + " raw_symbols = r.get(\n", + " f\"https://cloud.iexapis.com/stable/ref-data/symbols?token=sk_b3323ec3072e44c5acc414868bdd40ce\"\n", + " ).json()\n", + " symbols = pd.DataFrame(data=raw_symbols)\n", + "\n", + " symbols[\"description\"] = \"$\" + symbols[\"symbol\"] + \": \" + symbols[\"name\"]\n", + " symbols[\"id\"] = symbols[\"symbol\"]\n", + "\n", + " symbols = symbols[[\"id\", \"symbol\", \"name\", \"description\"]]\n", + "\n", + " return symbols\n", + "\n", + "\n", + "\n", + " def coins():\n", + "\n", + " raw_symbols = r.get(\"https://api.coingecko.com/api/v3/coins/list\").json()\n", + " symbols = pd.DataFrame(data=raw_symbols)\n", + "\n", + " symbols[\"description\"] = \"$$\" + symbols[\"symbol\"] + \": \" + symbols[\"name\"]\n", + " symbols = symbols[[\"id\", \"symbol\", \"name\", \"description\"]]\n", + "\n", + " return symbols" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id symbol \\\n", + "0 A A \n", + "1 AA AA \n", + "2 AAA AAA \n", + "3 AAAU AAAU \n", + "4 AAC AAC \n", + "... ... ... \n", + "6565 zyro zyro \n", + "6566 zytara-dollar zusd \n", + "6567 zyx zyx \n", + "6568 zzz-finance zzz \n", + "6569 zzz-finance-v2 zzzv2 \n", + "\n", + " name \\\n", + "0 Agilent Technologies Inc. \n", + "1 Alcoa Corp \n", + "2 Listed Funds Trust - AAF First Priority CLO Bo... \n", + "3 Goldman Sachs Physical Gold ETF Shares - Goldm... \n", + "4 Ares Acquisition Corporation - Class A \n", + "... ... \n", + "6565 Zyro \n", + "6566 Zytara Dollar \n", + "6567 ZYX \n", + "6568 zzz.finance \n", + "6569 zzz.finance v2 \n", + "\n", + " description \n", + "0 $A: Agilent Technologies Inc. \n", + "1 $AA: Alcoa Corp \n", + "2 $AAA: Listed Funds Trust - AAF First Priority ... \n", + "3 $AAAU: Goldman Sachs Physical Gold ETF Shares ... \n", + "4 $AAC: Ares Acquisition Corporation - Class A \n", + "... ... \n", + "6565 $$zyro: Zyro \n", + "6566 $$zusd: Zytara Dollar \n", + "6567 $$zyx: ZYX \n", + "6568 $$zzz: zzz.finance \n", + "6569 $$zzzv2: zzz.finance v2 \n", + "\n", + "[16946 rows x 4 columns]" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idsymbolnamedescription
0AAAgilent Technologies Inc.$A: Agilent Technologies Inc.
1AAAAAlcoa Corp$AA: Alcoa Corp
2AAAAAAListed Funds Trust - AAF First Priority CLO Bo...$AAA: Listed Funds Trust - AAF First Priority ...
3AAAUAAAUGoldman Sachs Physical Gold ETF Shares - Goldm...$AAAU: Goldman Sachs Physical Gold ETF Shares ...
4AACAACAres Acquisition Corporation - Class A$AAC: Ares Acquisition Corporation - Class A
...............
6565zyrozyroZyro$$zyro: Zyro
6566zytara-dollarzusdZytara Dollar$$zusd: Zytara Dollar
6567zyxzyxZYX$$zyx: ZYX
6568zzz-financezzzzzz.finance$$zzz: zzz.finance
6569zzz-finance-v2zzzv2zzz.finance v2$$zzzv2: zzz.finance v2
\n

16946 rows × 4 columns

\n
" + }, + "metadata": {}, + "execution_count": 51 + } + ], + "source": [ + "df = pd.concat([stocks(), coins()])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + " def search_symbols(search: str):\n", + " \"\"\"Performs a fuzzy search to find stock symbols closest to a search term.\n", + "\n", + " Parameters\n", + " ----------\n", + " search : str\n", + " String used to search, could be a company name or something close to the companies stock ticker.\n", + "\n", + " Returns\n", + " -------\n", + " List[tuple[str, str]]\n", + " A list tuples of every stock sorted in order of how well they match. Each tuple contains: (Symbol, Issue Name).\n", + " \"\"\"\n", + "\n", + " try:\n", + " if search_index[search]: return search_index[search]\n", + " except KeyError:\n", + " pass\n", + "\n", + "\n", + "\n", + " search = search.lower()\n", + "\n", + " df[\"Match\"] = df.apply(\n", + " lambda x: fuzz.ratio(search, f\"{x['symbol']}\".lower()),\n", + " axis=1,\n", + " )\n", + "\n", + " df.sort_values(by=\"Match\", ascending=False, inplace=True)\n", + " if df[\"Match\"].head().sum() < 300:\n", + " df[\"Match\"] = df.apply(\n", + " lambda x: fuzz.partial_ratio(search, x[\"name\"].lower()),\n", + " axis=1,\n", + " )\n", + "\n", + " df.sort_values(by=\"Match\", ascending=False, inplace=True)\n", + "\n", + " symbols = df.head(20)\n", + " symbol_list = list(zip(list(symbols[\"symbol\"]), list(symbols[\"description\"])))\n", + " \n", + " return symbol_list" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "search_list = df['id'].to_list() + df['description'].to_list()\n", + "search_index = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + " 5%|▍ | 1545/33892 [06:51<2:23:40, 3.75it/s]\n" + ] + }, + { + "output_type": "error", + "ename": "KeyboardInterrupt", + "evalue": "", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msearch_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0msearch_index\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msearch_symbols\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36msearch_symbols\u001b[0;34m(search)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0msearch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msearch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 24\u001b[0;31m df[\"Match\"] = df.apply(\n\u001b[0m\u001b[1;32m 25\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfuzz\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34mf\"{x['symbol']}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, raw, result_type, args, **kwds)\u001b[0m\n\u001b[1;32m 7763\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7764\u001b[0m )\n\u001b[0;32m-> 7765\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7766\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7767\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapplymap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mna_action\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mget_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_raw\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 185\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 186\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 187\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_empty_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_standard\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 276\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mres_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_series_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 277\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0;31m# wrap results\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_series_generator\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 287\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0moption_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"mode.chained_assignment\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 288\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 289\u001b[0m \u001b[0;31m# ignore SettingWithCopy here in case the user mutates\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mseries_generator\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 408\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0marr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 409\u001b[0m \u001b[0;31m# GH#35462 re-pin mgr in case setitem changed it\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 410\u001b[0;31m \u001b[0mser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_mgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmgr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 411\u001b[0m \u001b[0mblk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[0mser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.9/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__setattr__\u001b[0;34m(self, name, value)\u001b[0m\n\u001b[1;32m 5473\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5474\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5475\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5476\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5477\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "\n", + "for i in tqdm(search_list):\n", + " search_index[i] = search_symbols(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "\n", + "\n", + "with open('search_index.pickle', 'wb') as handle:\n", + " pickle.dump(search_index, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", + "\n", + "# with open('filename.pickle', 'rb') as handle:\n", + "# b = pickle.load(handle)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file