powers/Sundog-ML-Course: materials/GenAI/MakingData.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "470561fa",
   "metadata": {},
   "source": [
    "Install the latest OpenAI package..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b36942c1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: openai in e:\\anaconda3\\lib\\site-packages (0.27.9)\n",
      "Requirement already satisfied: tqdm in e:\\anaconda3\\lib\\site-packages (from openai) (4.64.1)\n",
      "Requirement already satisfied: requests>=2.20 in e:\\anaconda3\\lib\\site-packages (from openai) (2.28.1)\n",
      "Requirement already satisfied: aiohttp in e:\\anaconda3\\lib\\site-packages (from openai) (3.8.3)\n",
      "Requirement already satisfied: idna<4,>=2.5 in e:\\anaconda3\\lib\\site-packages (from requests>=2.20->openai) (3.4)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in e:\\anaconda3\\lib\\site-packages (from requests>=2.20->openai) (2022.12.7)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in e:\\anaconda3\\lib\\site-packages (from requests>=2.20->openai) (1.26.13)\n",
      "Requirement already satisfied: charset-normalizer<3,>=2 in e:\\anaconda3\\lib\\site-packages (from requests>=2.20->openai) (2.1.1)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in e:\\anaconda3\\lib\\site-packages (from aiohttp->openai) (1.3.3)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in e:\\anaconda3\\lib\\site-packages (from aiohttp->openai) (1.3.1)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in e:\\anaconda3\\lib\\site-packages (from aiohttp->openai) (6.0.2)\n",
      "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in e:\\anaconda3\\lib\\site-packages (from aiohttp->openai) (4.0.2)\n",
      "Requirement already satisfied: attrs>=17.3.0 in e:\\anaconda3\\lib\\site-packages (from aiohttp->openai) (22.1.0)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in e:\\anaconda3\\lib\\site-packages (from aiohttp->openai) (1.8.1)\n",
      "Requirement already satisfied: colorama in e:\\anaconda3\\lib\\site-packages (from tqdm->openai) (0.4.6)\n"
     ]
    }
   ],
   "source": [
    "# %pip (unlike !pip) installs into the environment of the running kernel.\n",
    "%pip install --upgrade openai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "00df62fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from openai import OpenAI\n",
    "# Spell out where the API key comes from; reading OPENAI_API_KEY from the\n",
    "# environment is also what OpenAI() does when no key is passed.\n",
    "client = OpenAI(api_key=os.environ.get(\"OPENAI_API_KEY\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "276cb367",
   "metadata": {},
   "source": [
    "Upload our training and evaluation files, in chat completion format:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "638aa772",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<File file id=file-9lI2ovFA1UJskgOPpxDTwEhG at 0x2266872ea90> JSON: {\n",
       "  \"object\": \"file\",\n",
       "  \"id\": \"file-9lI2ovFA1UJskgOPpxDTwEhG\",\n",
       "  \"purpose\": \"fine-tune\",\n",
       "  \"filename\": \"file\",\n",
       "  \"bytes\": 1774941,\n",
       "  \"created_at\": 1692794707,\n",
       "  \"status\": \"uploaded\",\n",
       "  \"status_details\": null\n",
       "}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Open the file in a context manager so the handle is closed as soon as the\n",
    "# upload finishes, rather than leaking until garbage collection.\n",
    "with open(\"./DATA_train.jsonl\", \"rb\") as train_fh:\n",
    "    train_upload = client.files.create(file=train_fh, purpose='fine-tune')\n",
    "\n",
    "# Display the returned file object; its id is used when creating the job.\n",
    "train_upload"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2d3da0ad",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<File file id=file-UqPVnkk9z8Q74BEUqPlnhjHL at 0x226669e59f0> JSON: {\n",
       "  \"object\": \"file\",\n",
       "  \"id\": \"file-UqPVnkk9z8Q74BEUqPlnhjHL\",\n",
       "  \"purpose\": \"fine-tune\",\n",
       "  \"filename\": \"file\",\n",
       "  \"bytes\": 442619,\n",
       "  \"created_at\": 1692794711,\n",
       "  \"status\": \"uploaded\",\n",
       "  \"status_details\": null\n",
       "}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Open the file in a context manager so the handle is closed as soon as the\n",
    "# upload finishes, rather than leaking until garbage collection.\n",
    "with open(\"./DATA_eval.jsonl\", \"rb\") as eval_fh:\n",
    "    eval_upload = client.files.create(file=eval_fh, purpose='fine-tune')\n",
    "\n",
    "# Display the returned file object; its id is used when creating the job.\n",
    "eval_upload"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b45dc0e1",
   "metadata": {},
   "source": [
    "Check the status of these files by copying in the IDs returned above. If there are JSON errors, they will be reported here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "ffc4be66",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<File file id=file-UqPVnkk9z8Q74BEUqPlnhjHL at 0x2266865c180> JSON: {\n",
       "  \"object\": \"file\",\n",
       "  \"id\": \"file-UqPVnkk9z8Q74BEUqPlnhjHL\",\n",
       "  \"purpose\": \"fine-tune\",\n",
       "  \"filename\": \"file\",\n",
       "  \"bytes\": 442619,\n",
       "  \"created_at\": 1692794711,\n",
       "  \"status\": \"uploaded\",\n",
       "  \"status_details\": null\n",
       "}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# ID copied from the output of the upload cell above.\n",
    "uploaded_file_id = \"file-UqPVnkk9z8Q74BEUqPlnhjHL\"\n",
    "client.files.retrieve(uploaded_file_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0558d96b",
   "metadata": {},
   "source": [
    "Start our fine-tuning job! Copy in the IDs of our uploaded training and validation files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7f83856c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<FineTuningJob fine_tuning.job id=ftjob-mQlhbPB5vsog1SeDLNx2xAMj at 0x226669e58b0> JSON: {\n",
       "  \"object\": \"fine_tuning.job\",\n",
       "  \"id\": \"ftjob-mQlhbPB5vsog1SeDLNx2xAMj\",\n",
       "  \"model\": \"gpt-3.5-turbo-0613\",\n",
       "  \"created_at\": 1692794886,\n",
       "  \"finished_at\": null,\n",
       "  \"fine_tuned_model\": null,\n",
       "  \"organization_id\": \"org-DBeDgDH8c36NSJobwuaBPXrW\",\n",
       "  \"result_files\": [],\n",
       "  \"status\": \"created\",\n",
       "  \"validation_file\": \"file-UqPVnkk9z8Q74BEUqPlnhjHL\",\n",
       "  \"training_file\": \"file-9lI2ovFA1UJskgOPpxDTwEhG\",\n",
       "  \"hyperparameters\": {\n",
       "    \"n_epochs\": 3\n",
       "  },\n",
       "  \"trained_tokens\": null\n",
       "}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Launch the job; the file IDs are the ones returned by the two upload cells.\n",
    "client.fine_tuning.jobs.create(\n",
    "    model=\"gpt-3.5-turbo\",\n",
    "    training_file=\"file-9lI2ovFA1UJskgOPpxDTwEhG\",\n",
    "    validation_file=\"file-UqPVnkk9z8Q74BEUqPlnhjHL\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "52ab0e27",
   "metadata": {},
   "source": [
    "Get general info about this job."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e6621b29",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<FineTuningJob fine_tuning.job id=ftjob-mQlhbPB5vsog1SeDLNx2xAMj at 0x22666a16220> JSON: {\n",
       "  \"object\": \"fine_tuning.job\",\n",
       "  \"id\": \"ftjob-mQlhbPB5vsog1SeDLNx2xAMj\",\n",
       "  \"model\": \"gpt-3.5-turbo-0613\",\n",
       "  \"created_at\": 1692794886,\n",
       "  \"finished_at\": null,\n",
       "  \"fine_tuned_model\": null,\n",
       "  \"organization_id\": \"org-DBeDgDH8c36NSJobwuaBPXrW\",\n",
       "  \"result_files\": [],\n",
       "  \"status\": \"running\",\n",
       "  \"validation_file\": \"file-UqPVnkk9z8Q74BEUqPlnhjHL\",\n",
       "  \"training_file\": \"file-9lI2ovFA1UJskgOPpxDTwEhG\",\n",
       "  \"hyperparameters\": {\n",
       "    \"n_epochs\": 3\n",
       "  },\n",
       "  \"trained_tokens\": null\n",
       "}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Job ID copied from the output of the create call above.\n",
    "job_id = \"ftjob-mQlhbPB5vsog1SeDLNx2xAMj\"\n",
    "client.fine_tuning.jobs.retrieve(job_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f3c4e264",
   "metadata": {},
   "source": [
    "Monitor its progress. When done, you can use the resulting fine-tuned model ID in the playground (or the API)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "a38f20e2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<OpenAIObject list at 0x22668759950> JSON: {\n",
       "  \"object\": \"list\",\n",
       "  \"data\": [\n",
       "    {\n",
       "      \"object\": \"fine_tuning.job.event\",\n",
       "      \"id\": \"ftevent-n0GA9lmPtAulghPIgsfSSdM1\",\n",
       "      \"created_at\": 1692797270,\n",
       "      \"level\": \"info\",\n",
       "      \"message\": \"Fine-tuning job successfully completed\",\n",
       "      \"data\": null,\n",
       "      \"type\": \"message\"\n",
       "    },\n",
       "    {\n",
       "      \"object\": \"fine_tuning.job.event\",\n",
       "      \"id\": \"ftevent-miblzvSktANikUk7sJOQe6Ir\",\n",
       "      \"created_at\": 1692797267,\n",
       "      \"level\": \"info\",\n",
       "      \"message\": \"New fine-tuned model created: ft:gpt-3.5-turbo-0613:sundog-software-llc::7qiBf2gI\",\n",
       "      \"data\": null,\n",
       "      \"type\": \"message\"\n",
       "    },\n",
       "    {\n",
       "      \"object\": \"fine_tuning.job.event\",\n",
       "      \"id\": \"ftevent-VVGJVV3Ss6N6ROQjK4xhib6T\",\n",
       "      \"created_at\": 1692796566,\n",
       "      \"level\": \"info\",\n",
       "      \"message\": \"Step 1000: training loss=1.89\",\n",
       "      \"data\": {\n",
       "        \"step\": 1000,\n",
       "        \"train_loss\": 1.8927490711212158,\n",
       "        \"train_mean_token_accuracy\": 0.5443037748336792\n",
       "      },\n",
       "      \"type\": \"metrics\"\n",
       "    },\n",
       "    {\n",
       "      \"object\": \"fine_tuning.job.event\",\n",
       "      \"id\": \"ftevent-QiOj8MeAHinAFG4DD3JIeEA1\",\n",
       "      \"created_at\": 1692794887,\n",
       "      \"level\": \"info\",\n",
       "      \"message\": \"Fine tuning job started\",\n",
       "      \"data\": null,\n",
       "      \"type\": \"message\"\n",
       "    },\n",
       "    {\n",
       "      \"object\": \"fine_tuning.job.event\",\n",
       "      \"id\": \"ftevent-ZS3NuX0GHccf7llea6HHPU8e\",\n",
       "      \"created_at\": 1692794886,\n",
       "      \"level\": \"info\",\n",
       "      \"message\": \"Created fine-tune: ftjob-mQlhbPB5vsog1SeDLNx2xAMj\",\n",
       "      \"data\": null,\n",
       "      \"type\": \"message\"\n",
       "    }\n",
       "  ],\n",
       "  \"has_more\": false\n",
       "}"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The v1 client names this parameter fine_tuning_job_id; the old id= keyword\n",
    "# from the 0.x API is not accepted and raises a TypeError.\n",
    "client.fine_tuning.jobs.list_events(\n",
    "    fine_tuning_job_id=\"ftjob-mQlhbPB5vsog1SeDLNx2xAMj\",\n",
    "    limit=10,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "45e8edcb",
   "metadata": {},
   "source": [
    "For comparison, see how the non-fine-tuned GPT model does:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "feaaf07d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"role\": \"assistant\",\n",
      "  \"content\": \"DATA: Aye, Captain. Scanning for lifeforms. However, as an android, I do not possess the biological sensors typically used for such scans. Instead, I rely on my advanced sensory systems and tricorder readings to detect signs of life.\\n\\n(Pauses briefly)\\n\\nDATA: I am detecting several lifeforms in the vicinity, Captain. Shall I provide detailed analysis?\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Send the prompt to the stock model so its reply can be compared with the\n",
    "# fine-tuned model's reply.\n",
    "system_prompt = {\"role\": \"system\", \"content\": \"Data is an android in the TV series Star Trek: The Next Generation.\"}\n",
    "user_prompt = {\"role\": \"user\", \"content\": \"PICARD: Mr. Data, scan for lifeforms.\"}\n",
    "\n",
    "completion = client.chat.completions.create(\n",
    "    model=\"gpt-3.5-turbo\",\n",
    "    messages=[system_prompt, user_prompt],\n",
    ")\n",
    "\n",
    "print(completion.choices[0].message)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7ac0f853",
   "metadata": {},
   "source": [
    "When it's done, try our fine-tuned model! Copy in our fine-tuned model ID."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "1249edde",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"role\": \"assistant\",\n",
      "  \"content\": \"DATA: I am finding very low levels of radioactivity -- I cannot make a conclusive scan for lifeforms, Captain.\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Send the prompt to the fine-tuned model; the model ID comes from the\n",
    "# \"New fine-tuned model created\" event in the job's event list.\n",
    "system_prompt = {\"role\": \"system\", \"content\": \"Data is an android in the TV series Star Trek: The Next Generation.\"}\n",
    "user_prompt = {\"role\": \"user\", \"content\": \"PICARD: Mr. Data, scan for lifeforms.\"}\n",
    "\n",
    "completion = client.chat.completions.create(\n",
    "    model=\"ft:gpt-3.5-turbo-0613:sundog-software-llc::7qiBf2gI\",\n",
    "    messages=[system_prompt, user_prompt],\n",
    ")\n",
    "\n",
    "print(completion.choices[0].message)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82066210",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}