Added OpenAI connector and pytest harness

Signed-off-by: Jacob Torrey <jacob@thinkst.com>
2023-05-15 13:37:16 -06:00 · 2023-05-15 13:37:16 -06:00 · a9be80e94b
commit a9be80e94b
--- a/openai_detect.py
+++ b/openai_detect.py
@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+
+import os, requests
+from typing import Optional, Dict, Tuple
+
+MODEL_NAME = 'model-detect-v2'
+API_KEY = os.getenv('OPENAI_API_KEY')
+API_URL = 'https://api.openai.com/v1/completions'
+
+def make_req(text : str) -> Optional[Dict]:
+    if len(text) < 1000:
+        print("Input too short for OpenAI to classify")
+        return None
+    headers = {
+        'authorization': 'Bearer ' + API_KEY,
+        'origin': 'https://platform.openai.com',
+        'openai-organization': 'org-gxAZne8U4jJ8pb632XJBLH1i'
+    }
+    data = {
+        'prompt': text + '<disc_score|>',
+        'max_tokens': 1,
+        'temperature': 1,
+        'top_p': 1,
+        'n': 1,
+        'model': MODEL_NAME,
+        'stream': False,
+        'stop': '\\n',
+        'logprobs': 5
+    }
+    res = requests.post(API_URL, headers=headers, json=data)
+    return res.json().get('choices', [None])[0]
+
+def run_on_file(fn : str) -> Optional[Tuple[str, float]]:
+    with open(fn, 'r') as fp:
+        contents = fp.read()
+    res = make_req(contents)
+    if res is None:
+        print("Unable to classify!")
+        return None
+    else:
+        #print(res)
+        if res.get('text') == '"':
+            return ('AI', abs(res.get('logprobs').get('token_logprobs')[0]))
+        elif res.get('text') == '!':
+            return ('Human', abs(res.get('logprobs').get('token_logprobs')[0]))
+        return None #res.get('text')
--- a/test_openai_detect.py
+++ b/test_openai_detect.py
@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+import pytest, os
+from warnings import warn
+from openai_detect import run_on_file
+
+AI_SAMPLE_DIR = 'samples/llm-generated/'
+HUMAN_SAMPLE_DIR = 'samples/human-generated/'
+
+ai_files = os.listdir(AI_SAMPLE_DIR)
+ai_files = filter(lambda f: os.path.getsize(AI_SAMPLE_DIR + f) >= 1000, ai_files)
+human_files = os.listdir(HUMAN_SAMPLE_DIR)
+human_files = filter(lambda f: os.path.getsize(HUMAN_SAMPLE_DIR + f) >= 1000, human_files)
+
+def test_training_file():
+    assert run_on_file('ai-generated.txt')[0] == 'AI', 'The training corpus should always be detected as AI-generated... since it is'
+
+@pytest.mark.parametrize('f', human_files)
+def test_human_samples(f):
+    (classification, score) = run_on_file(HUMAN_SAMPLE_DIR + f)
+    assert classification == 'Human', f + ' is a human-generated file, misclassified as AI-generated with confidence ' + str(round(score, 8))
+
+@pytest.mark.parametrize('f', ai_files)
+def test_llm_sample(f):
+    (classification, score) = run_on_file(AI_SAMPLE_DIR + f)
+    assert classification == 'AI', f + ' is an LLM-generated file, misclassified as human-generated with confidence ' + str(round(score, 8))