<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>arbdwj</title>
    <description>fullwrong</description>
    <link>http://arbdwj.github.io/</link>
    <atom:link href="http://arbdwj.github.io/feed.xml" rel="self" type="application/rss+xml" />
    <pubDate>Thu, 02 Jul 2026 12:46:50 +0000</pubDate>
    <lastBuildDate>Thu, 02 Jul 2026 12:46:50 +0000</lastBuildDate>
    <generator>Jekyll v3.10.0</generator>
    
      <item>
        <title>Tracing Eval-Awareness Emergence Through Training of OLMo 3</title>
        <description>&lt;p&gt;A post on LessWrong tracing how a model’s awareness of being evaluated emerges across the training stages of OLMo 3: negligible during pretraining, rising with supervised fine-tuning, dropping with DPO, and rising again substantially during RL.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://www.lesswrong.com/posts/c2tqL9xPbttisAHtt/tracing-eval-awareness-emergence-through-training-of-olmo-3&quot;&gt;Read it on LessWrong →&lt;/a&gt;&lt;/p&gt;
</description>
        <pubDate>Wed, 10 Jun 2026 12:00:00 +0000</pubDate>
        <link>http://arbdwj.github.io/2026/06/10/tracing-eval-awareness-olmo3/</link>
        <guid isPermaLink="true">http://arbdwj.github.io/2026/06/10/tracing-eval-awareness-olmo3/</guid>
        
        
      </item>
    
      <item>
        <title>Scaling Laws for LLM-Based Data Compression</title>
        <description>&lt;p&gt;How LLMs compress text, image, and speech, and the universal power laws that govern compression ratio as a function of model and data scale.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://www.lesswrong.com/posts/rWBYQXJTHBmCBCLEk/scaling-laws-for-llm-based-data-compression&quot;&gt;Read it on LessWrong →&lt;/a&gt;&lt;/p&gt;
</description>
        <pubDate>Wed, 23 Jul 2025 10:00:00 +0000</pubDate>
        <link>http://arbdwj.github.io/2025/07/23/scaling-compression/</link>
        <guid isPermaLink="true">http://arbdwj.github.io/2025/07/23/scaling-compression/</guid>
        
        
      </item>
    
      <item>
        <title>Experiments with the Platonic Representation Hypothesis</title>
        <description>&lt;p&gt;Testing whether the Platonic Representation Hypothesis — that models across modalities converge to a shared statistical model of reality — still holds once you move beyond in-distribution data.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://www.lesswrong.com/posts/Su2pg7iwBM55yjQdt/exploring-the-platonic-representation-hypothesis-beyond-in&quot;&gt;Read it on LessWrong →&lt;/a&gt;&lt;/p&gt;
</description>
        <pubDate>Sun, 27 Oct 2024 10:00:00 +0000</pubDate>
        <link>http://arbdwj.github.io/2024/10/27/prh-ood/</link>
        <guid isPermaLink="true">http://arbdwj.github.io/2024/10/27/prh-ood/</guid>
        
        
      </item>
    
      <item>
        <title>Understanding Hidden Computations in Chain-of-Thought Reasoning</title>
        <description>&lt;p&gt;Investigating how transformers keep reasoning when chain-of-thought steps are replaced by filler tokens, using the 3SUM task — and a modified greedy decoding scheme that recovers the hidden computation with 100% consistency.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://www.lesswrong.com/posts/3duPFzDDX8myY6q4K/understanding-hidden-computations-in-chain-of-thought&quot;&gt;Read it on LessWrong →&lt;/a&gt;&lt;/p&gt;
</description>
        <pubDate>Wed, 28 Aug 2024 01:49:00 +0000</pubDate>
        <link>http://arbdwj.github.io/2024/08/28/fillertokens/</link>
        <guid isPermaLink="true">http://arbdwj.github.io/2024/08/28/fillertokens/</guid>
        
        
      </item>
    
      <item>
        <title>Adversarial training against goal misgeneralization is ELK-hard</title>
        <description>&lt;p&gt;An argument that solving goal misgeneralization in the worst case reduces to Eliciting Latent Knowledge: any adversarial-training scheme that relies on a non-deceiving prediction head runs straight into the ELK problem.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://www.lesswrong.com/posts/MWSCqzPrAbNrYoqWv/adversarial-training-against-goal-misgeneralization-is-elk&quot;&gt;Read it on LessWrong →&lt;/a&gt;&lt;/p&gt;
</description>
        <pubDate>Fri, 24 Mar 2023 10:00:00 +0000</pubDate>
        <link>http://arbdwj.github.io/2023/03/24/goalmisgen/</link>
        <guid isPermaLink="true">http://arbdwj.github.io/2023/03/24/goalmisgen/</guid>
        
        
      </item>
    
      <item>
        <title>The AGI needs to be honest</title>
        <description>&lt;p&gt;On why certifying that a superintelligent system is &lt;em&gt;intelligent&lt;/em&gt; first requires proving that it is &lt;em&gt;honest&lt;/em&gt; — and why honesty is the hard-to-find global optimum among easy-to-find deceptive ones.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://www.lesswrong.com/posts/hqzHbew35Jx4xoDhE/the-agi-needs-to-be-honest&quot;&gt;Read it on LessWrong →&lt;/a&gt;&lt;/p&gt;
</description>
        <pubDate>Sat, 16 Oct 2021 10:00:00 +0000</pubDate>
        <link>http://arbdwj.github.io/2021/10/16/truth/</link>
        <guid isPermaLink="true">http://arbdwj.github.io/2021/10/16/truth/</guid>
        
        
      </item>
    
  </channel>
</rss>
