From 951a38222826078667eb37f055081c84a9b491d5 Mon Sep 17 00:00:00 2001 From: Anson Date: Sat, 23 May 2020 07:58:07 -0700 Subject: [PATCH] init commit --- .gitignore | 2 + book.ipynb | 381 +++++++++++++++++++++++++++++++++++++++++++++++ grad.py | 43 ++++++ out/out.feather | Bin 0 -> 4666 bytes requirements.txt | 4 + 5 files changed, 430 insertions(+) create mode 100644 .gitignore create mode 100644 book.ipynb create mode 100644 grad.py create mode 100644 out/out.feather create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6c9964a --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.jpg +*.mp4 \ No newline at end of file diff --git a/book.ipynb b/book.ipynb new file mode 100644 index 0000000..b5181a1 --- /dev/null +++ b/book.ipynb @@ -0,0 +1,381 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import cv2\n", + "from multiprocessing import Pool" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "vidcap = cv2.VideoCapture('C:\\Coding\\grad\\Desert Ridge High School 2020 Graduation-Ixx3uZJM_MI.mp4')\n", + "frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "TypeError", + "evalue": "VideoCapture.get() missing required argument 'propId' (pos 1)", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mvidcap\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgrab\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mvidcap\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m: VideoCapture.get() missing required argument 'propId' (pos 1)" + ] + } + ], + "source": [ + "vidcap.grab()\n", + "vidcap.get()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def writeim(frame):\n", + " success, image = vidcap.read()\n", + " if success:\n", + " cv2.imwrite(f\"C:/Coding/grad/frames2/{frame}.jpg\", image)\n", + " else:\n", + " print(f\"{frame} Failed\")\n", + " return frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "pool = Pool()\n", + "\n", + "pool.map(writeim, range(frames))\n", + "pool.close()\n", + "pool.join()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "8.67 ms ± 179 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + } + ], + "source": [ + "success, image = vidcap.read()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "KeyboardInterrupt", + "evalue": "", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[0msuccess\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;31m# cv2.imwrite(f\"C:/Coding/grad/frames/frame{count}.jpg\", image)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0msuccess\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mimage\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvidcap\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[1;31m# count+=1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "count = 0\n", + "success, image = vidcap.read()\n", + "while success:\n", + " cv2.imwrite(f\"C:/Coding/grad/frames/frame{count}.jpg\", image)\n", + " success, image = vidcap.read()\n", + " count+=1" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "with Pool(12) as p:\n", + " p.map(writeim,range(frames))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "writeim(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "12" + }, + "metadata": {}, + "execution_count": 13 + } + ], + "source": [ + "os.cpu_count()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "FileNotFoundError", + "evalue": "[WinError 2] The system cannot find the file specified", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mcmd\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m\"'C:/Program Files/Tesseract-OCR/tesseract.exe'\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'C:/Coding/grad/218371.jpg'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'C:/Coding/grad/text'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'-l'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'eng'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0msubprocess\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcmd\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32mC:\\Python38\\lib\\subprocess.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[0;32m 487\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'stderr'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mPIPE\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 488\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 489\u001b[1;33m \u001b[1;32mwith\u001b[0m \u001b[0mPopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mpopenargs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mprocess\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 490\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 491\u001b[0m \u001b[0mstdout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstderr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mprocess\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommunicate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Python38\\lib\\subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)\u001b[0m\n\u001b[0;32m 852\u001b[0m encoding=encoding, errors=errors)\n\u001b[0;32m 853\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 854\u001b[1;33m self._execute_child(args, executable, preexec_fn, close_fds,\n\u001b[0m\u001b[0;32m 855\u001b[0m \u001b[0mpass_fds\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcwd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0menv\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 856\u001b[0m \u001b[0mstartupinfo\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcreationflags\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mshell\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Python38\\lib\\subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[1;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)\u001b[0m\n\u001b[0;32m 1305\u001b[0m \u001b[1;31m# Start the process\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1306\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1307\u001b[1;33m hp, ht, pid, tid = _winapi.CreateProcess(executable, args,\n\u001b[0m\u001b[0;32m 1308\u001b[0m \u001b[1;31m# no special security\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1309\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mFileNotFoundError\u001b[0m: [WinError 2] The system cannot find the file specified" + ] + } + ], + "source": [ + "cmd = [\"'C:/Program Files/Tesseract-OCR/tesseract.exe'\", 'C:/Coding/grad/218371.jpg', 'C:/Coding/grad/text', '-l', 'eng']\n", + "subprocess.run(cmd)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "[\"'C:\\\\Program\",\n \"Files\\\\Tesseract-OCR\\\\tesseract.exe'\",\n 'C:\\\\Coding\\\\grad\\\\218371.jpg',\n 'C:\\\\Coding\\\\grad\\\\text',\n '-l',\n 'eng']" + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "\"'C:\\\\Program Files\\\\Tesseract-OCR\\\\tesseract.exe' C:\\\\Coding\\grad\\\\218371.jpg C:\\\\Coding\\\\grad\\\\text -l eng\".split()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "1" + }, + "metadata": {}, + "execution_count": 19 + } + ], + "source": [ + "import os\n", + "\n", + "os.system(\"'C:\\\\Program Files\\\\Tesseract-OCR\\\\tesseract.exe' C:\\\\Coding\\\\grad\\\\218371.jpg C:\\\\Coding\\\\grad\\\\text2.txt -l eng\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "t = 830\n", + "r = 1400\n", + "b = 945\n", + "l = 530\n", + "\n", + "crop_coords = (l ,t, r, b)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "im = \"C:\\\\Coding\\\\grad\\\\218344.jpg\"\n", + "im = Image.open(im)\n", + "im = im.crop(crop_coords)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "import pytesseract\n", + "pytesseract.pytesseract.tesseract_cmd = r'C:\\\\Program Files\\\\Tesseract-OCR\\\\tesseract.exe'" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from multiprocessing import Pool\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "def im_str(im_name):\n", + " im = f\"C:\\\\Coding\\\\grad\\\\frames2\\\\{im_name}.jpg\"\n", + " im = Image.open(im)\n", + " im = im.crop(crop_coords)\n", + " print(im_name, pytesseract.image_to_string(im))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "ims = range(312676,312676+10000)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "p.map(im_str,ims)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def f(x):\n", + " return x*x\n", + "\n", + "\n", + "with Pool(5) as p:\n", + " print(p.map(f, [1, 2, 3]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.1-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python38164bit5615766555ed4dbe9b9be48cfc976e42", + "display_name": "Python 3.8.1 64-bit" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/grad.py b/grad.py new file mode 100644 index 0000000..461cf37 --- /dev/null +++ b/grad.py @@ -0,0 +1,43 @@ +from PIL import Image +import pytesseract +import pandas as pd + +import swifter +import glob + +pytesseract.pytesseract.tesseract_cmd = ( + r"C:\\Program Files\\Tesseract-OCR\\tesseract.exe" +) + +# Coords of where names appear on screen +t = 830 +r = 1400 +b = 945 +l = 530 +crop_coords = (l, t, r, b) + +frames_path = "C:\\Coding\\grad\\frames2" +f_paths = glob.glob(frames_path + "\\*jpg") +print("Frames loaded:", len(f_paths)) +# Make dataframe with frame number as index and frame_path as a column +df = pd.DataFrame( + {"frame_path": f_paths}, + index=[int(f_path.split("\\")[-1][:-4]) - 1 for f_path in f_paths], +) + + +df = df.reset_index(drop=True) + + +def im_str(im_path): + # im = f"C:\\Coding\\grad\\frames2\\{im_name}.jpg" + im = Image.open(im_path) + im = im.crop(crop_coords) + return pytesseract.image_to_string(im) + + +df["text"] = df["frame_path"].swifter.apply(im_str) + +print(df) + +df.to_feather("C:\\Coding\\grad\\out\\full.feather") diff --git a/out/out.feather b/out/out.feather new file mode 100644 index 0000000000000000000000000000000000000000..3f856f90aefdb65afd12d7c659ed66716862b808 GIT binary patch literal 4666 zcmeI0O=u%!7{}i>{fKLn2zBX6;;<}LN|=v+*edMSy@ zyMEpjJh;am_Rxi1JPLZ~Q4bzOJcxL34}yBzJuIvTq5FT{d1m_seURcUZ~4uC-uHbb z@8mzvOy=p#+}zE1OQ{322fa#V6}1cMSv8>gRT_I}J*g+-8Fc%2Jd5}~ng%qU#+K&N zdO!ZG0qi$izv4EP`VD~{qv}_@&0;yIwd#J;n9eOMj^>QGThw#jiBa9{7`|Kg5FQ_I z;FzVVSF03H4ach$qjrPCqlqyJ_gnQw`}A<#i`FUV$qggEe93v^s+%|)hkCde%88?x2a=}f5T+-{A+2)L(h$Z5MR4v)T*V>4cp{y zqwR)aut8_I?uE^2;M1s?wVd;Z3@{*3DmQfcu^E3L>RVt_fNIM3<4b^x982uK>Q zmvoQmRk~Tw5{-L@{aqgTo8w04gwe8tu_oK3pTvK|^KZ~a;6}Sg&x?ko{(=*`S72Ra zeTTKk`Yx->T4F7;R#-jOCDvuu71k>2D(f0+jkV6|vj(gU*7sOL)+TGj+G1U2-C*5h zZL@B%c39tM-DbVT`T^^QtRJy{%t{`=2wn$e@F}i504SoTSz%VZA6nGnSzz*04zk>si!{wU+CGZK@1>b=`!60%e$){`6g=;~j z>Ms|TL$^}Uw^!3C*puV-l$Bp?ET0u%O974)FeU|zO92y7z@!xLiWD$)c7S7=Qh?y3 zV+u|>rr@Mw3Qjtv;G|;;PCBOGq+<$BI;P~LB{^wHPFj+amgJ-*IcZ5wT9T9a5k2$H zaxBS7OLEedoU|n;ZOKVna?+NZv?V8P$w^ys(w3aGB_|#2Bp%u7dGIp00^R~ua0`41 z_P{>)8PN0nU+^5B0~_3w!}~!MA{(_kVzYz%zKR8(3eWi$fCG}>`|Eg94Ff+n@3ws}H|v$xR@{1buxm&+8UE9aK=lA_Vf{{@K~*q- z>LG(3)h|@xyoN2kL*sQy*S)&v!G0F^v2IW51n#3==+?|!x*ryCR9~LIjy+ndC)IJc SX4bj+f7Q%Xf6VCr89fFDH@7|j literal 0 HcmV?d00001 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a26ecf3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +swifter==0.304 +pytesseract==0.3.4 +pandas==1.0.3 +Pillow==7.1.2 \ No newline at end of file