From 700a6e3441dff4a9394d39304bd2bf452ccb4bc8 Mon Sep 17 00:00:00 2001
From: Jacob Torrey
Date: Thu, 15 Jun 2023 10:47:50 -0600
Subject: [PATCH] Add CUDA support for Roberta (local) and fix an alignment
 issue

Signed-off-by: Jacob Torrey
---
 roberta_detect.py | 13 ++++++++-----
 roberta_local.py  | 30 ++++++++++++++++++++----------
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/roberta_detect.py b/roberta_detect.py
index 2a9084b..3f10e14 100644
--- a/roberta_detect.py
+++ b/roberta_detect.py
@@ -7,7 +7,7 @@ from typing import Optional, Tuple
 
 from roberta_local import classify_text
 
-def run_on_file_chunked(filename : str, chunk_size : int = 1025, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
+def run_on_file_chunked(filename : str, chunk_size : int = 800, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
     '''
     Given a filename (and an optional chunk size) returns the score for the contents of that file.
     This function chunks the file into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@@ -17,7 +17,7 @@ def run_on_file_chunked(filename : str, chunk_size : int = 1025, fuzziness : int
         contents = fp.read()
     return run_on_text_chunked(contents, chunk_size, fuzziness)
 
-def run_on_text_chunked(contents : str, chunk_size : int = 1025, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
+def run_on_text_chunked(contents : str, chunk_size : int = 800, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
     '''
     Given a text (and an optional chunk size) returns the score for the contents of that string.
     This function chunks the string into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@@ -35,12 +35,15 @@ def run_on_text_chunked(contents : str, chunk_size : int = 1025, fuzziness : int
     chunks = []
     while start + chunk_size < len(contents) and end != -1:
         end = contents.rfind(' ', start, start + chunk_size + 1)
+        if end == -1:
+            end = contents.rfind('\n', start, start + chunk_size + 1)
+        if end == -1:
+            print("Unable to chunk naturally!")
+            end = start + chunk_size + 1
         chunks.append(contents[start:end])
         start = end + 1
     chunks.append(contents[start:])
-    scores = []
-    for c in chunks:
-        scores.append(classify_text(c))
+    scores = classify_text(chunks)
     ssum : float = 0.0
     for s in scores:
         if s[0] == 'AI':
diff --git a/roberta_local.py b/roberta_local.py
index 4fe53f5..c7dd8ae 100644
--- a/roberta_local.py
+++ b/roberta_local.py
@@ -1,17 +1,27 @@
 #!/usr/bin/env python3
 
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from typing import List, Tuple
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import torch
 
+if torch.cuda.is_available():
+    DEVICE = 'cuda:0'
+else:
+    DEVICE = 'cpu'
+
 tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
 model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
 
+pipe = pipeline('text-classification', model=model, tokenizer=tokenizer, device=DEVICE)
+
+def classify_text(s : List[str]) -> List[Tuple[str, float]]:
+    res = pipe(s)
+    out = []
+    for r in res:
+        label = r['label']
+        conf = r['score']
+        if label == 'Real':
+            out.append(('Human', conf))
+        else:
+            out.append(('AI', conf))
+    return out
-def classify_text(s : str):
-    inputs = tokenizer(s, return_tensors='pt')
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    pc = model.config.id2label[logits.argmax().item()]
-    conf = max(torch.softmax(logits, dim=1).tolist()[0])
-    if pc == 'Real':
-        return ('Human', conf)
-    return ('AI', conf)
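
P.S. The new word-boundary fallback can be sanity-checked on its own. Below is a rough standalone sketch of the same strategy (prefer the last space in the window, then the last newline, then a hard cut at the size limit); the name chunk_text is made up for illustration:

    def chunk_text(contents: str, chunk_size: int = 800) -> list:
        # Mirrors the loop in run_on_text_chunked(): find the last space in
        # the next chunk_size-character window, fall back to the last
        # newline, and only hard-cut when neither exists (e.g. a single
        # unbroken 800+ character token).
        start, end = 0, 0
        chunks = []
        while start + chunk_size < len(contents) and end != -1:
            end = contents.rfind(' ', start, start + chunk_size + 1)
            if end == -1:
                end = contents.rfind('\n', start, start + chunk_size + 1)
            if end == -1:
                end = start + chunk_size + 1
            chunks.append(contents[start:end])
            start = end + 1
        chunks.append(contents[start:])
        return chunks

    # No chunk exceeds the budget by more than the one-character overshoot
    # of the hard-cut case.
    assert all(len(c) <= 801 for c in chunk_text('word ' * 1000))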
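
P.P.S. A quick end-to-end smoke test of the batched path, as a sketch only: sample.txt is a placeholder filename, and the roberta-base-openai-detector weights are downloaded on first use.

    #!/usr/bin/env python3
    # Sketch: run the patched chunked scorer end to end. classify_text() now
    # takes a list of chunks and scores them in one pipeline call, on the
    # GPU when torch.cuda.is_available() reports one.
    from roberta_detect import run_on_text_chunked

    with open('sample.txt') as fp:  # placeholder input file
        text = fp.read()
    result = run_on_text_chunked(text)  # e.g. ('Human', 0.87) or ('AI', ...)
    print(result)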