rail-berkeley · qrico64 · Mar 28, 2025 · Apr 2, 2025 · Apr 3, 2025 · Apr 4, 2025
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,91 @@
+__pycache__/
+*.py[cod]
+
+examples/experiments/**/debug_*/
+examples/experiments/**/*.pkl
+examples/experiments/**/checkpoint_*
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+.eggs/
+dist/
+build/
+sdist/
+*.egg-info/
+*.whl
+
+# Environments
+.venv/
+venv/
+env/
+ENV/
+*.env
+
+# PyInstaller
+#  Usually these files are written by a python script; excluding them is not always
+#  appropriate.
+#  https://docs.pyinstaller.org/en/stable/when-things-go-wrong.html
+# _MEIPASS*
+# _MEI*
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+nosetests.xml
+coverage.xml
+*.log
+*.rpt
+*.db
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.sqlite3
+*.sqlitedb
+local_settings.py
+/static/
+/media/
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx stuff:
+docs/_build
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# mkdocs
+site/
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.sock
+
+# VS Code
+.vscode/
+
+# PyCharm
+.idea/
+demo_data/
+
+examples/experiments/**/debug_*/
+examples/experiments/**/*.pkl
+examples/experiments/**/checkpoint_*
+**/*.out
diff --git a/check_server.sh b/check_server.sh
@@ -0,0 +1,21 @@
+echo "Getting position..."
+curl -X POST localhost:5000/getpos_euler
+
+echo ""
+echo "Activating gripper..."
+curl -X POST localhost:5000/activate_gripper
+echo ""
+echo ""
+
+echo "Closing gripper in 1s..."
+sleep 1
+
+curl -X POST localhost:5000/close_gripper
+echo ""
+echo ""
+
+echo "Opening gripper in 3s..."
+sleep 3
+
+curl -X POST localhost:5000/reset_gripper
+echo ""
diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,94 @@
+name: serl2
+channels:
+  - defaults
+  - https://repo.anaconda.com/pkgs/main
+  - https://repo.anaconda.com/pkgs/r
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h5eee18b_6
+  - ca-certificates=2025.2.25=h06a4308_0
+  - ld_impl_linux-64=2.40=h12ee557_0
+  - libffi=3.4.4=h6a678d5_1
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.16=h5eee18b_0
+  - pip=25.0=py310h06a4308_0
+  - python=3.10.16=he870216_1
+  - readline=8.2=h5eee18b_0
+  - setuptools=75.8.0=py310h06a4308_0
+  - sqlite=3.45.3=h5eee18b_0
+  - tk=8.6.14=h39e8969_0
+  - tzdata=2025a=h04d1e81_0
+  - wheel=0.45.1=py310h06a4308_0
+  - xz=5.6.4=h5eee18b_1
+  - zlib=1.2.13=h5eee18b_1
+  - pip:
+      - absl-py==2.2.2
+      - blinker==1.9.0
+      - catkin-pkg==1.0.0
+      - certifi==2025.1.31
+      - cffi==1.17.1
+      - charset-normalizer==3.4.1
+      - click==8.1.8
+      - cloudpickle==3.1.1
+      - cython==3.0.12
+      - defusedxml==0.7.1
+      - distro==1.9.0
+      - docutils==0.21.2
+      - easyhid==0.0.10
+      - empy==4.2
+      - evdev==1.9.1
+      - flask==3.1.0
+      - gym==0.26.2
+      - gym-notices==0.0.8
+      - hidapi==0.14.0.post4
+      - idna==3.10
+      - itsdangerous==2.2.0
+      - jax==0.4.35
+      - jax-cuda12-pjrt==0.4.35
+      - jax-cuda12-plugin==0.4.35
+      - jaxlib==0.4.34
+      - jinja2==3.1.6
+      - lz4==4.4.4
+      - markupsafe==3.0.2
+      - ml-dtypes==0.5.1
+      - numpy==2.2.4
+      - nvidia-cublas-cu12==12.8.4.1
+      - nvidia-cuda-cupti-cu12==12.8.90
+      - nvidia-cuda-nvcc-cu12==12.8.93
+      - nvidia-cuda-runtime-cu12==12.8.90
+      - nvidia-cudnn-cu12==9.8.0.87
+      - nvidia-cufft-cu12==11.3.3.83
+      - nvidia-cusolver-cu12==11.7.3.90
+      - nvidia-cusparse-cu12==12.5.8.93
+      - nvidia-nccl-cu12==2.26.2
+      - nvidia-nvjitlink-cu12==12.8.93
+      - opencv-python==4.11.0.86
+      - opt-einsum==3.4.0
+      - pycparser==2.22
+      - pymodbus==2.5.3
+      - pynput==1.8.1
+      - pyopengl==3.1.9
+      - pyparsing==3.2.3
+      - pyquaternion==0.9.9
+      - pyrealsense2==2.55.1.6486
+      - pyserial==3.5
+      - pyspacemouse==1.1.4
+      - python-dateutil==2.9.0.post0
+      - python-xlib==0.33
+      - pyyaml==6.0.2
+      - pyzmq==26.3.0
+      - requests==2.32.3
+      - rospkg==1.6.0
+      - scipy==1.15.2
+      - six==1.17.0
+      - typing==3.7.4.3
+      - typing-extensions==4.13.1
+      - urllib3==2.3.0
+      - werkzeug==3.1.3
+      - zmq==0.0.0
+prefix: /home/robot/miniconda3/envs/serl
diff --git a/examples/analysis.py b/examples/analysis.py
@@ -0,0 +1,157 @@
+
+from absl import app, flags
+import time
+import numpy as np
+import os
+import pickle
+import imageio
+import cv2
+import queue
+from pynput import keyboard
+import threading
+from flax.training import checkpoints
+import jax
+import jax.numpy as jnp
+
+# from experiments.mappings import CONFIG_MAPPING
+from serl_launcher.agents.continuous.sac import SACAgent
+from serl_launcher.agents.continuous.sac_hybrid_single import SACAgentHybridSingleArm
+from serl_launcher.agents.continuous.sac_hybrid_dual import SACAgentHybridDualArm
+
+from serl_launcher.utils.launcher import (
+    make_sac_pixel_agent,
+    make_sac_pixel_agent_hybrid_single_arm,
+    make_sac_pixel_agent_hybrid_dual_arm,
+    make_trainer_config,
+    make_wandb_logger,
+)
+
+checkpoint_path = "/home/qirico/Desktop/All-Weird/Human-Interventions/jax-hitl-hil-serl/examples/experiments/franka_sim/debug_rlif_2"  # absolute, machine-specific directory handed to checkpoints.restore_checkpoint
+
+from experiments.config import DefaultTrainingConfig
+class TrainConfig(DefaultTrainingConfig):
+    image_keys = ["front", "wrist"]
+    classifier_keys = ["front", "wrist"]
+    proprio_keys = ['panda/tcp_pos', 'panda/tcp_vel', 'panda/gripper_pos']
+    # buffer_period = 1000
+    # checkpoint_period = 5000
+    # steps_per_update = 50
+    pretraining_steps = 0 # How many steps to pre-train the model for using RLPD on offline data only.
+    reward_scale = 1 # How much to scale actual rewards (not RLIF penalties) for RLIF.
+    rlif_minus_one = False
+    checkpoint_period = 2000
+    cta_ratio = 2
+    random_steps = 0
+    discount = 0.98
+    buffer_period = 1000
+    batch_size = 64
+    encoder_type = "resnet-pretrained"
+    setup_mode = "single-arm-learned-gripper"
+
+exp_name = "franka_sim"  # not read again anywhere in the visible part of this script
+config = TrainConfig()  # its image_keys, encoder_type and discount are passed to the agent factory below
+# env = config.get_environment(fake_env=True,save_video=False,classifier=True)
+
+intervene_steps = 0  # Default number of steps between pre and post intervention states
+constraint_eps = 0.1  # Default constraint epsilon
+
+# obs_key_shapes = [('front', (1, 128, 128, 3)), ('state', (1, 7)), ('wrist', (1, 128, 128, 3))]
+obs_sample = {
+    'front': np.zeros((1, 128, 128, 3), dtype=np.uint8),
+    'state': np.zeros((1, 7), dtype=np.float32),
+    'wrist': np.zeros((1, 128, 128, 3), dtype=np.uint8),
+}
+action_sample = np.zeros(7, dtype=np.float32)
+
+agent: SACAgentHybridSingleArm = make_sac_pixel_agent_hybrid_single_arm(
+    seed=0,
+    sample_obs=obs_sample,
+    sample_action=action_sample,
+    image_keys=config.image_keys,
+    encoder_type=config.encoder_type,
+    discount=config.discount,
+    enable_cl=False,
+    soft_cl = False,
+    intervene_steps=intervene_steps,
+    constraint_eps=constraint_eps,
+)
+
+ckpt = checkpoints.restore_checkpoint(
+        os.path.abspath(checkpoint_path),
+        agent.state,
+        step='40000',
+    )
+agent = agent.replace(state=ckpt)
+
+
+preference_buffer_base_path="experiments/franka_sim/debug_rlif_2/interventions/transitions"
+preference_buffer_paths = [f"{preference_buffer_base_path}_{i}.pkl" for i in range(1000, 14000, 1000)]
+
+preference_buffer = []
+
+for preference_buffer_path in preference_buffer_paths:
+    if not os.path.exists(preference_buffer_path):
+        print(f"Preference buffer path {preference_buffer_path} does not exist.")
+        continue
+
+    # Load the preference buffer
+    with open(preference_buffer_path, 'rb') as f:
+        preference_buffer_part = pickle.load(f)
+        preference_buffer.extend(preference_buffer_part)
+
+rng = jax.random.PRNGKey(0)
+
+def get_action(obs, rng):
+    rng, key = jax.random.split(rng)
+    actions = agent.sample_actions(
+        observations=jax.device_put(obs),
+        argmax=True,
+        seed=key
+    )
+    return actions, rng
+
+pre_intervention_obs = [p['observations'][0] for p in preference_buffer]
+intervene_actions = [p['actions'][0] for p in preference_buffer]
+policy_actions = [p['policy_actions'][0] for p in preference_buffer]
+post_intervention_obs = [p['observations'][-1] for p in preference_buffer]
+
+pre_intervention_obs = {
+    'front': np.array([obs['front'] for obs in pre_intervention_obs]),
+    'state': np.array([obs['state'] for obs in pre_intervention_obs]),
+    'wrist': np.array([obs['wrist'] for obs in pre_intervention_obs]),
+}
+pre_intervention_expert_action, rng = get_action(pre_intervention_obs, rng)
+
+
+post_intervention_obs = {
+    'front': np.array([obs['front'] for obs in post_intervention_obs]),
+    'state': np.array([obs['state'] for obs in post_intervention_obs]),
+    'wrist': np.array([obs['wrist'] for obs in post_intervention_obs]),
+}
+post_intervention_expert_action, rng = get_action(post_intervention_obs, rng)
+
+policy_actions = np.array(policy_actions)
+intervene_actions = np.array(intervene_actions)
+
+key, rng =  jax.random.split(rng)
+q_pre_expert = agent.forward_critic(pre_intervention_obs, pre_intervention_expert_action[:, :6], key)
+key, rng =  jax.random.split(rng)
+q_post_expert = agent.forward_critic(post_intervention_obs, post_intervention_expert_action[:, :6], key)
+key, rng =  jax.random.split(rng)
+q_pre_policy = agent.forward_critic(pre_intervention_obs, policy_actions[:, :6], key)
+key, rng =  jax.random.split(rng)
+q_pre_intervene = agent.forward_critic(pre_intervention_obs, intervene_actions[:, :6], key)
+# q_post_expert = agent.q_network.apply(agent.state.params, post_intervention_obs, post_intervention_expert_action)
+
+# q_pre_policy = agent.q_network.apply(agent.state.params, pre_intervention_obs, policy_actions)
+# q_post_policy = agent.q_network.apply(agent.state.params, pre_intervention_obs, policy_actions)
+constraint1_acc = ((q_pre_expert - q_post_expert) < 0).mean()
+qvalue_based_learning_intervene = ((q_pre_policy - q_pre_intervene) < 0).mean()
+qvalue_based_learning_expert = ((q_pre_policy - q_pre_expert) < 0).mean()
+constraint2_acc = ((q_pre_intervene - q_post_expert) < 0).mean()
+
+print(f"Constraint 1 accuracy: {constraint1_acc}")
+print(f"Q-value based learning intervene accuracy: {qvalue_based_learning_intervene}")
+print(f"Q-value based learning expert accuracy: {qvalue_based_learning_expert}")
+print(f"Constraint 2 accuracy: {constraint2_acc}")
+breakpoint()