From 7cade5aed9cb3cc7b5b07322f4e3071cf5933994 Mon Sep 17 00:00:00 2001
From: Alessandro Battilani <alessandro.battilani@gmail.com>
Date: Wed, 30 Oct 2024 22:37:32 +0100
Subject: [PATCH] initial

---
 .gitignore              |  12 ++
 .vscode/settings.json   |   3 +
 extract_ts.py           |  61 ++++++++
 timestamp_extraction.py | 211 ++++++++++++++++++++++++++++
 timestamp_frame.py      |  34 +++++
 timestamps.csv          | 301 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 622 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .vscode/settings.json
 create mode 100644 extract_ts.py
 create mode 100644 timestamp_extraction.py
 create mode 100644 timestamp_frame.py
 create mode 100644 timestamps.csv

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a8510c8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,12 @@
+*.pyc
+*.spec
+*.toc
+*.pkg
+*.zip
+*.html
+*.txt
+*.pyz
+*.pyo
+*.log
+*.lock
+.venv/*
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..35fe170
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "CodeGPT.apiKey": "Ollama"
+}
\ No newline at end of file
diff --git a/extract_ts.py b/extract_ts.py
new file mode 100644
index 0000000..d50bb84
--- /dev/null
+++ b/extract_ts.py
@@ -0,0 +1,61 @@
+import cv2
+import pytesseract
+import re
+
+# Set Tesseract executable path
+pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
+
+def extract_frame_and_timestamp(video_path):
+    """
+    Print the first burned-in timestamp that OCR finds in a video.
+
+    Frames are read in order and passed to Tesseract until one yields text
+    matching the pattern used by _extract_timestamp; the matched text is
+    printed and scanning stops.
+
+    Args:
+        video_path (str): Path to the video file.
+
+    Returns:
+        None. An error message is printed if the video cannot be opened;
+        nothing is printed if no frame contains a timestamp.
+    """
+    cap = cv2.VideoCapture(video_path)
+
+    # Check if the video was opened correctly
+    if not cap.isOpened():
+        print("Error opening the video")
+        return
+
+    try:
+        # Iterate over frames until one with a timestamp is found
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret:
+                # No frame could be read: nothing left to scan.
+                break
+            match = _extract_timestamp(pytesseract.image_to_string(frame))
+            if match:
+                print(match.group())
+                break
+    finally:
+        # Release the capture even if OCR raises part-way through.
+        cap.release()
+
+def _extract_timestamp(text):
+    """
+    Search text for a stamp shaped like NN-NN-NNNN<whitespace>NN:NN:NN.
+
+    Args:
+        text (str): Text to search for timestamps.
+
+    Returns:
+        The match object for the first occurrence (call .group() to get
+        the matched string), or None if no timestamp is found.
+    """
+    timestamp_re = re.compile(r'\d{2}-\d{2}-\d{4}\s\d{2}:\d{2}:\d{2}')
+    return timestamp_re.search(text)
+
+# Run the extraction on a hard-coded sample video (this also executes on import, since there is no __main__ guard)
+video_path = "/home/alex/Scaricati/2024_08_11_14_55_26.MP4"
+extract_frame_and_timestamp(video_path)
diff --git a/timestamp_extraction.py b/timestamp_extraction.py
new file mode 100644
index 0000000..fa64367
--- /dev/null
+++ b/timestamp_extraction.py
@@ -0,0 +1,211 @@
+# program to extract a timestamp from a video
+# written by Akshay Krishnan, Yashika (Interns, Amagi Media Labs)
+
+#usage:
+#timestamp_extract.py -b <Name_of_s3_bucket> -p <path_to_folder_containing_videos_in_bucket> -f(optional)
+#specifying f is optional. if specified, it downloads entire video from the bucket.
+#before running, ensure that a file 'videos.txt' contains the list of videos in s3 from which timestamp is to be extracted
+#after execution, extracted timestamps are stored in stamp_file.txt
+
+import cv2				#computer vision library to process images
+from PIL import Image 	#Python imaging library to handle images
+import pytesseract		#Google's tesseract-OCR wrapper for python
+import argparse 		#library to handle command line arguments
+import os 				#perform file operations, create and remove files
+import re 				#perform regular expression search
+import unicodedata		#convert string format between unicode and ascii
+import boto3 			#downloading files 
+import botocore 		#from AWS s3
+
+
+#function to convert the numbers in the stamp as detected by tesseract
+#into the standard hh:mm:ss:nn stamp format
+def stampFromText(text):
+	# Collect the ASCII digits of an OCR match, discarding separators and any
+	# non-ASCII noise. A membership test is used instead of encode()+isdigit()
+	# because iterating encoded bytes yields ints on Python 3, which have no
+	# isdigit(); for str/unicode input the digits kept are the same.
+	digits = [ch for ch in text if ch in '0123456789']
+	if len(digits) != 8:
+		# Anything but exactly four two-digit fields is treated as a misread.
+		return (False, 'garbage')
+	hh, mm, ss, nn = [digits[i] + digits[i+1] for i in (0, 2, 4, 6)]
+	return (True, hh + ':' + mm + ':' + ss + ':' + nn)
+
+
+#function to calculate the mismatch (in frames) between frame_no and the frame offset of stamp relative to
+#firstFrameStamp; both stamps are (h, m, s, f) sequences and fps defaults to the 25 that GetTimestamp assumes
+def calculateMismatch(stamp, firstFrameStamp, frame_no, fps=25):
+	h, m, s, f = firstFrameStamp
+	h1, m1, s1, f1 = stamp
+	init_f = (h*3600 + m*60 + s)*fps + f+1
+	now_f = (h1*3600 +m1*60+s1)*fps +f1 +1
+	return frame_no - (now_f - init_f)
+
+
+#function that runs OCR on successive frames until two successful readings in a row imply the same first-frame stamp; returns (stamp_confirmed, firstFrameStamp1, mismatch)
+def GetTimestamp(video):
+	cap = cv2.VideoCapture(video)
+	filename = "timestamp_test_file.bmp"	# scratch image written to disk and re-opened with PIL for tesseract
+	stamp_found = False	# becomes True after the first successful reading
+	frame_count = 0	# index of the frame being processed
+	stamp_confirmed = False	# becomes True when two successive readings agree; ends the loop
+	stamp = None
+	firstFrameStamp1 = None	# first-frame stamp implied by the previous successful reading
+
+	reset_count = 0	# incremented on every disagreement; not read or returned anywhere in this function
+	mismatch = 0
+	while(cap.isOpened() and stamp_confirmed is False):
+		ret, frame = cap.read()
+		if ret == True:
+			cv2.imwrite(filename, frame)	# first attempt: OCR the unmodified frame
+			text = pytesseract.image_to_string(Image.open(filename))
+			if len(text) > 0:
+				m = re.findall("[0-2][0-9].[0-5][0-9].[0-5][0-9].[0-9][0-9]", text)	# four two-digit groups separated by any single character
+				if len(m) > 0:
+					temp_flag, stamp = stampFromText(m[0])
+					if temp_flag is True:
+						firstFrameStamp = getTimeFirstFrame(stamp, frame_count, 25)	# back-projected to frame 0; 25 fps is hard-coded
+						if stamp_found is False:
+							stamp_found = True
+							firstFrameStamp1 = firstFrameStamp
+							
+						else:
+							x,y,w,z = firstFrameStamp1
+							x1,y1,w1,z1 = firstFrameStamp
+							if (x1==x and y1==y and z1==z and w1==w):
+								stamp_confirmed = True
+								mismatch = calculateMismatch(intStamp(stamp), firstFrameStamp, frame_count)
+							else:
+								firstFrameStamp1 = firstFrameStamp	# readings disagree: keep the newer one as the reference
+								reset_count+=1
+				else:	# text was read but nothing stamp-shaped matched: retry on a cropped, binarised strip. NOTE(review): this fallback is skipped when the OCR text is empty - confirm that is intended
+					gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+					rows, cols =  gray.shape
+					gray = gray[20:40, int(float(cols/3))+18:int(float(cols)*2/3)-16]	# rows 20-39, roughly the middle third of the width
+					gray = cv2.threshold(gray, 240,255,cv2.THRESH_BINARY)[1]	# binarise with threshold 240
+					gray = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)	# upscale 2x in both directions
+					gray = cv2.dilate(gray, cv2.getStructuringElement(cv2.MORPH_CROSS, (3,3)), iterations=1)
+					cv2.imwrite(filename, gray)
+					text = pytesseract.image_to_string(Image.open(filename))
+					os.remove(filename)
+					if len(text) > 0:
+						m = re.findall("[0-2][0-9].[0-5][0-9].[0-5][0-9].[0-9][0-9]", text)
+						if len(m) > 0:
+							temp_flag, stamp = stampFromText(m[0])
+							if temp_flag is True:
+								firstFrameStamp = getTimeFirstFrame(stamp, frame_count, 25)	# same confirmation logic as the full-frame branch above
+								if stamp_found is False:
+									stamp_found = True
+									firstFrameStamp1 = firstFrameStamp
+										
+								else:
+									x,y,w,z = firstFrameStamp1
+									x1,y1,w1,z1 = firstFrameStamp
+									if (x1==x and y1==y and z1==z and w1==w):
+										stamp_confirmed = True
+										mismatch = calculateMismatch(intStamp(stamp), firstFrameStamp, frame_count)
+									else:
+										firstFrameStamp1 = firstFrameStamp
+										reset_count+=1
+			frame_count += 1		
+		else:	# no frame could be read: stop scanning
+			break
+	if(os.path.isfile('timestamp_test_file.bmp')):	# remove the scratch image if the full-frame branch left it behind
+		os.remove('timestamp_test_file.bmp')
+	cap.release()
+	return stamp_confirmed, firstFrameStamp1, mismatch
+
+
+#if frame number is given as input, the function returns the run time to reach that particular frame
+def getTimeFromFrameNumber(frame_no, fps):
+	# Break a running frame index into (hours, minutes, seconds, frames),
+	# peeling off one unit at a time from the total.
+	frames_left = frame_no % fps
+	whole_secs = int(frame_no/fps)
+	whole_mins = int(whole_secs/60)
+	hours = int(whole_mins/60)
+	return (hours, whole_mins%60, whole_secs%60, frames_left)
+
+
+#helper function to convert stamps from string to integer tuples
+def intStamp(stamp):
+	# Parse a colon-separated stamp (e.g. 'hh:mm:ss:nn') into a list holding
+	# one int per field. The result is a list, not a tuple; the callers in
+	# this file only unpack it.
+	fields = stamp.split(':')
+	return [int(field) for field in fields]
+
+
+#function to calculate the timestamp on the first frame, given the stamp
+#read at any other frame and that frame's number
+def getTimeFirstFrame(stamp, frameno, fps):
+	# Floor division (//) is spelled out so the fields stay whole numbers on
+	# Python 3 as well as Python 2; the stamp's own frame field is dropped
+	# and the result always reports frame 0.
+	h, m, s, _ = intStamp(stamp)
+	secs = h*3600 + m*60 + s
+	# Whole seconds elapsed since frame 0 are subtracted from the stamp.
+	isecs = secs - (frameno//fps)
+	return (isecs//3600, (isecs%3600)//60, isecs%60, 0)
+
+
+
+if __name__ == '__main__':
+	# Driver: for every video listed in videos.txt, fetch it from S3, extract its first-frame stamp and append the result to stamp_file.txt.
+	ap = argparse.ArgumentParser()
+	ap.add_argument("-b", "--bucket", required=True, help="S3 bucket name")
+	ap.add_argument("-p", "--path", required=True, help="path to folder containing videos in bucket")
+	ap.add_argument("-f", "--full", default=False, help="boolean flag, if set downloads entire media file", action='store_true')
+	args = vars(ap.parse_args())
+	s3 = boto3.resource('s3')
+	client = boto3.client('s3')
+	# byte range requested when -f is not given
+	offset = 0
+	end = 5000000
+	if(os.path.isfile('stamp_file.txt')):
+		db_file = open("stamp_file.txt", 'r')
+		text_from_file = str(db_file.read())
+		db_file.close()
+	else:
+		text_from_file = ''
+	videos_file = open("videos.txt")
+	videoname = ''
+	stamp_file = open("stamp_file.txt", 'a+')
+	lines = videos_file.read().split('\n')
+	try:
+		for line in lines:
+			if len(line) > 0:
+				words = line.split(' ')
+				videoname = words[len(words)-1]
+				# substring check against the results already on disk: skip videos that were processed before
+				if videoname in text_from_file:
+					print(videoname + " : entry already exists for this video")
+				else:
+					if args["full"] is True:
+						s3.Bucket(args["bucket"]).download_file(args["path"]+videoname, videoname)
+					else:
+						obj = client.get_object(Bucket=args["bucket"], Key=args["path"]+videoname,Range='bytes={}-{}'.format(offset, end))
+						newdata = obj['Body'].read()
+						# the body is raw bytes, so the file must be opened in binary mode
+						f = open(videoname,'wb')
+						f.write(newdata)
+						f.close()
+					flagt, stamp, err = GetTimestamp(videoname)
+					os.remove(videoname)
+					if flagt == True:
+						stamp_file.write(videoname+', '+str(stamp[0])+":"+str(stamp[1])+":"+str(stamp[2])+":"+str(stamp[3])+', '+str(err)+'\n')
+						print("video  " + videoname + " " + str(stamp) + " " + str(err))
+					else:
+						print(videoname + " : error in finding stamp")
+		stamp_file.close()
+		videos_file.close()
+	except KeyboardInterrupt:
+		if(os.path.isfile('timestamp_test_file.bmp')):
+			os.remove('timestamp_test_file.bmp')
+		if(os.path.isfile(videoname)):
+			os.remove(videoname)
+		stamp_file.close()
+		videos_file.close()
+		print("quitting")
+
diff --git a/timestamp_frame.py b/timestamp_frame.py
new file mode 100644
index 0000000..dcaf9c7
--- /dev/null
+++ b/timestamp_frame.py
@@ -0,0 +1,34 @@
+import cv2
+import pytesseract
+import re
+
+pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
+
+def salva_frame(video_path):
+    """Print the first OCR-detected timestamp in the video at video_path.
+
+    Frames are read in order until Tesseract's output for one of them
+    contains text shaped like NN-NN-NNNN<whitespace>NN:NN:NN. Returns None;
+    an error message is printed if the video cannot be opened.
+    """
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        print("Errore nell'apertura del video")
+        return
+    try:
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret:
+                break
+            text = pytesseract.image_to_string(frame)
+            timestamp = re.search(r'\d{2}-\d{2}-\d{4}\s\d{2}:\d{2}:\d{2}', text)
+            if timestamp:
+                print(timestamp.group())
+                break
+    finally:
+        # Release the capture even if OCR raises part-way through.
+        cap.release()
+
+# Run the program
+video_path = "/home/alex/Scaricati/2024_08_11_14_55_26.MP4"  # Replace with the path to your video
+salva_frame(video_path)
\ No newline at end of file
diff --git a/timestamps.csv b/timestamps.csv
new file mode 100644
index 0000000..4a7ef6a
--- /dev/null
+++ b/timestamps.csv
@@ -0,0 +1,301 @@
+Frame,Timestamp
+0,0.0
+1,0.03333333333333333
+2,0.06666666666666667
+3,0.1
+4,0.13333333333333333
+5,0.16666666666666666
+6,0.2
+7,0.23333333333333334
+8,0.26666666666666666
+9,0.3
+10,0.3333333333333333
+11,0.36666666666666664
+12,0.4
+13,0.43333333333333335
+14,0.4666666666666667
+15,0.5
+16,0.5333333333333333
+17,0.5666666666666667
+18,0.6
+19,0.6333333333333333
+20,0.6666666666666666
+21,0.7
+22,0.7333333333333333
+23,0.7666666666666666
+24,0.8
+25,0.8333333333333334
+26,0.8666666666666667
+27,0.9
+28,0.9333333333333333
+29,0.9666666666666667
+30,1.0
+31,1.0333333333333332
+32,1.0666666666666667
+33,1.1
+34,1.1333333333333333
+35,1.1666666666666667
+36,1.2
+37,1.2333333333333334
+38,1.2666666666666666
+39,1.3
+40,1.3333333333333333
+41,1.3666666666666667
+42,1.4
+43,1.4333333333333333
+44,1.4666666666666666
+45,1.5
+46,1.5333333333333332
+47,1.5666666666666667
+48,1.6
+49,1.6333333333333333
+50,1.6666666666666667
+51,1.7
+52,1.7333333333333334
+53,1.7666666666666666
+54,1.8
+55,1.8333333333333333
+56,1.8666666666666667
+57,1.9
+58,1.9333333333333333
+59,1.9666666666666666
+60,2.0
+61,2.033333333333333
+62,2.0666666666666664
+63,2.1
+64,2.1333333333333333
+65,2.1666666666666665
+66,2.2
+67,2.2333333333333334
+68,2.2666666666666666
+69,2.3
+70,2.3333333333333335
+71,2.3666666666666667
+72,2.4
+73,2.433333333333333
+74,2.466666666666667
+75,2.5
+76,2.533333333333333
+77,2.5666666666666664
+78,2.6
+79,2.6333333333333333
+80,2.6666666666666665
+81,2.7
+82,2.7333333333333334
+83,2.7666666666666666
+84,2.8
+85,2.8333333333333335
+86,2.8666666666666667
+87,2.9
+88,2.933333333333333
+89,2.966666666666667
+90,3.0
+91,3.033333333333333
+92,3.0666666666666664
+93,3.1
+94,3.1333333333333333
+95,3.1666666666666665
+96,3.2
+97,3.2333333333333334
+98,3.2666666666666666
+99,3.3
+100,3.3333333333333335
+101,3.3666666666666667
+102,3.4
+103,3.433333333333333
+104,3.466666666666667
+105,3.5
+106,3.533333333333333
+107,3.5666666666666664
+108,3.6
+109,3.6333333333333333
+110,3.6666666666666665
+111,3.6999999999999997
+112,3.7333333333333334
+113,3.7666666666666666
+114,3.8
+115,3.8333333333333335
+116,3.8666666666666667
+117,3.9
+118,3.933333333333333
+119,3.966666666666667
+120,4.0
+121,4.033333333333333
+122,4.066666666666666
+123,4.1
+124,4.133333333333333
+125,4.166666666666667
+126,4.2
+127,4.233333333333333
+128,4.266666666666667
+129,4.3
+130,4.333333333333333
+131,4.366666666666666
+132,4.4
+133,4.433333333333334
+134,4.466666666666667
+135,4.5
+136,4.533333333333333
+137,4.566666666666666
+138,4.6
+139,4.633333333333333
+140,4.666666666666667
+141,4.7
+142,4.733333333333333
+143,4.766666666666667
+144,4.8
+145,4.833333333333333
+146,4.866666666666666
+147,4.9
+148,4.933333333333334
+149,4.966666666666667
+150,5.0
+151,5.033333333333333
+152,5.066666666666666
+153,5.1
+154,5.133333333333333
+155,5.166666666666667
+156,5.2
+157,5.233333333333333
+158,5.266666666666667
+159,5.3
+160,5.333333333333333
+161,5.366666666666666
+162,5.4
+163,5.433333333333334
+164,5.466666666666667
+165,5.5
+166,5.533333333333333
+167,5.566666666666666
+168,5.6
+169,5.633333333333333
+170,5.666666666666667
+171,5.7
+172,5.733333333333333
+173,5.766666666666667
+174,5.8
+175,5.833333333333333
+176,5.866666666666666
+177,5.9
+178,5.933333333333334
+179,5.966666666666667
+180,6.0
+181,6.033333333333333
+182,6.066666666666666
+183,6.1
+184,6.133333333333333
+185,6.166666666666667
+186,6.2
+187,6.233333333333333
+188,6.266666666666667
+189,6.3
+190,6.333333333333333
+191,6.366666666666666
+192,6.4
+193,6.433333333333334
+194,6.466666666666667
+195,6.5
+196,6.533333333333333
+197,6.566666666666666
+198,6.6
+199,6.633333333333333
+200,6.666666666666667
+201,6.7
+202,6.733333333333333
+203,6.766666666666667
+204,6.8
+205,6.833333333333333
+206,6.866666666666666
+207,6.8999999999999995
+208,6.933333333333334
+209,6.966666666666667
+210,7.0
+211,7.033333333333333
+212,7.066666666666666
+213,7.1
+214,7.133333333333333
+215,7.166666666666667
+216,7.2
+217,7.233333333333333
+218,7.266666666666667
+219,7.3
+220,7.333333333333333
+221,7.366666666666666
+222,7.3999999999999995
+223,7.433333333333334
+224,7.466666666666667
+225,7.5
+226,7.533333333333333
+227,7.566666666666666
+228,7.6
+229,7.633333333333333
+230,7.666666666666667
+231,7.7
+232,7.733333333333333
+233,7.766666666666667
+234,7.8
+235,7.833333333333333
+236,7.866666666666666
+237,7.8999999999999995
+238,7.933333333333334
+239,7.966666666666667
+240,8.0
+241,8.033333333333333
+242,8.066666666666666
+243,8.1
+244,8.133333333333333
+245,8.166666666666666
+246,8.2
+247,8.233333333333333
+248,8.266666666666666
+249,8.3
+250,8.333333333333334
+251,8.366666666666667
+252,8.4
+253,8.433333333333334
+254,8.466666666666667
+255,8.5
+256,8.533333333333333
+257,8.566666666666666
+258,8.6
+259,8.633333333333333
+260,8.666666666666666
+261,8.7
+262,8.733333333333333
+263,8.766666666666666
+264,8.8
+265,8.833333333333334
+266,8.866666666666667
+267,8.9
+268,8.933333333333334
+269,8.966666666666667
+270,9.0
+271,9.033333333333333
+272,9.066666666666666
+273,9.1
+274,9.133333333333333
+275,9.166666666666666
+276,9.2
+277,9.233333333333333
+278,9.266666666666666
+279,9.3
+280,9.333333333333334
+281,9.366666666666667
+282,9.4
+283,9.433333333333334
+284,9.466666666666667
+285,9.5
+286,9.533333333333333
+287,9.566666666666666
+288,9.6
+289,9.633333333333333
+290,9.666666666666666
+291,9.7
+292,9.733333333333333
+293,9.766666666666666
+294,9.8
+295,9.833333333333334
+296,9.866666666666667
+297,9.9
+298,9.933333333333334
+299,9.966666666666667