From 700a6e3441dff4a9394d39304bd2bf452ccb4bc8 Mon Sep 17 00:00:00 2001
From: Jacob Torrey
Date: Thu, 15 Jun 2023 10:47:50 -0600
Subject: [PATCH] Add CUDA support for Roberta (local) and fix an alignment
 issue

Signed-off-by: Jacob Torrey
---
 roberta_detect.py | 13 ++++++++-----
 roberta_local.py  | 30 ++++++++++++++++++++----------
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/roberta_detect.py b/roberta_detect.py
index 2a9084b..3f10e14 100644
--- a/roberta_detect.py
+++ b/roberta_detect.py
@@ -7,7 +7,7 @@ from typing import Optional, Tuple
 
 from roberta_local import classify_text
 
-def run_on_file_chunked(filename : str, chunk_size : int = 1025, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
+def run_on_file_chunked(filename : str, chunk_size : int = 800, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
     '''
     Given a filename (and an optional chunk size) returns the score for the contents of that file.
     This function chunks the file into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@@ -17,7 +17,7 @@ def run_on_file_chunked(filename : str, chunk_size : int = 1025, fuzziness : int
         contents = fp.read()
     return run_on_text_chunked(contents, chunk_size, fuzziness)
 
-def run_on_text_chunked(contents : str, chunk_size : int = 1025, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
+def run_on_text_chunked(contents : str, chunk_size : int = 800, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
     '''
     Given a text (and an optional chunk size) returns the score for the contents of that string.
     This function chunks the string into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@@ -35,12 +35,15 @@ def run_on_text_chunked(contents : str, chunk_size : int = 1025, fuzziness : int
     chunks = []
     while start + chunk_size < len(contents) and end != -1:
         end = contents.rfind(' ', start, start + chunk_size + 1)
+        if end == -1:
+            end = contents.rfind('\n', start, start + chunk_size + 1)
+        if end == -1:
+            print("Unable to chunk naturally!")
+            end = start + chunk_size + 1
         chunks.append(contents[start:end])
         start = end + 1
     chunks.append(contents[start:])
-    scores = []
-    for c in chunks:
-        scores.append(classify_text(c))
+    scores = classify_text(chunks)
     ssum : float = 0.0
     for s in scores:
         if s[0] == 'AI':
diff --git a/roberta_local.py b/roberta_local.py
index 4fe53f5..c7dd8ae 100644
--- a/roberta_local.py
+++ b/roberta_local.py
@@ -1,17 +1,27 @@
 #!/usr/bin/env python3
 
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from typing import List, Tuple
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import torch
 
+if torch.cuda.is_available():
+    DEVICE = 'cuda:0'
+else:
+    DEVICE = 'cpu'
+
 tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
 model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
 
+pipe = pipeline('text-classification', model=model, tokenizer=tokenizer, device=DEVICE)
+
+def classify_text(s : List[str]) -> List[Tuple[str, float]]:
+    res = pipe(s)
+    out = []
+    for r in res:
+        label = r['label']
+        conf = r['score']
+        if label == 'Real':
+            out.append(('Human', conf))
+        else:
+            out.append(('AI', conf))
+    return out
-def classify_text(s : str):
-    inputs = tokenizer(s, return_tensors='pt')
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    pc = model.config.id2label[logits.argmax().item()]
-    conf = max(torch.softmax(logits, dim=1).tolist()[0])
-    if pc == 'Real':
-        return ('Human', conf)
-    return ('AI', conf)
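
P.S. The new word-boundary fallback can be sanity-checked on its own. Below is a rough standalone sketch of the same strategy (prefer the last space in the window, then the last newline, then a hard cut at the size limit); the name chunk_text is made up for illustration:

    def chunk_text(contents: str, chunk_size: int = 800) -> list:
        # Mirrors the loop in run_on_text_chunked(): find the last space in
        # the next chunk_size-character window, fall back to the last
        # newline, and only hard-cut when neither exists (e.g. a single
        # unbroken 800+ character token).
        start, end = 0, 0
        chunks = []
        while start + chunk_size < len(contents) and end != -1:
            end = contents.rfind(' ', start, start + chunk_size + 1)
            if end == -1:
                end = contents.rfind('\n', start, start + chunk_size + 1)
            if end == -1:
                end = start + chunk_size + 1
            chunks.append(contents[start:end])
            start = end + 1
        chunks.append(contents[start:])
        return chunks

    # No chunk exceeds the budget by more than the one-character overshoot
    # of the hard-cut case.
    assert all(len(c) <= 801 for c in chunk_text('word ' * 1000))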
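
P.P.S. A quick end-to-end smoke test of the batched path, as a sketch only: sample.txt is a placeholder filename, and the roberta-base-openai-detector weights are downloaded on first use.

    #!/usr/bin/env python3
    # Sketch: run the patched chunked scorer end to end. classify_text() now
    # takes a list of chunks and scores them in one pipeline call, on the
    # GPU when torch.cuda.is_available() reports one.
    from roberta_detect import run_on_text_chunked

    with open('sample.txt') as fp:  # placeholder input file
        text = fp.read()
    result = run_on_text_chunked(text)  # e.g. ('Human', 0.87) or ('AI', ...)
    print(result)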