Search | arXiv e-print repository

Showing 1–50 of 241 results for author: Hao, S

Searching in archive cs (search in all archives: https://arxiv.org/search/?searchtype=author&query=Hao%2C+S). Results are sorted by announcement date (newest first), 50 per page.

1. arXiv:2502.08524 [pdf, other] — cs.LG (Machine Learning); cs.CL (Computation and Language)
   Title: LLM Pretraining with Continuous Concepts
   Authors: Jihoon Tack, Jack Lanchantin, Jane Yu, Andrew Cohen, Ilia Kulikov, Janice Lan, Shibo Hao, Yuandong Tian, Jason Weston, Xian Li
   Abstract: Next token prediction has been the standard training objective used in large language model pretraining. Representations are learned as a result of optimizing for token-level perplexity. We propose Continuous Concept Mixing (CoCoMix), a novel pretraining framework that combines discrete next token prediction with continuous concepts. Specifically, CoCoMix predicts continuous concepts learned from a pretrained sparse autoencoder and mixes them into the model's hidden state by interleaving them with token hidden representations. Through experiments on multiple benchmarks, including language modeling and downstream reasoning tasks, we show that CoCoMix is more sample-efficient and consistently outperforms standard next token prediction, knowledge distillation, and inserting pause tokens. We find that combining both concept learning and interleaving in an end-to-end framework is critical to the performance gains. Furthermore, CoCoMix enhances interpretability and steerability by allowing direct inspection and modification of the predicted concepts, offering a transparent way to guide the model's internal reasoning process.
   Submitted 12 February, 2025; originally announced February 2025.
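
The interleaving mechanism in CoCoMix is easy to make concrete. Below is a minimal PyTorch-style sketch of the idea as the abstract describes it; the class and field names, and the exact mixing scheme, are our illustrative assumptions rather than the paper's implementation.

```python
import torch
import torch.nn as nn

class ConceptMixer(nn.Module):
    """Sketch of continuous-concept mixing: predict SAE concepts from the
    hidden state, then interleave a projected concept vector with the
    token hidden states so later layers see both."""

    def __init__(self, hidden_dim: int, concept_dim: int):
        super().__init__()
        self.concept_head = nn.Linear(hidden_dim, concept_dim)  # predicts concepts of a frozen, pretrained SAE
        self.concept_proj = nn.Linear(concept_dim, hidden_dim)  # maps predicted concepts back to hidden size

    def forward(self, h: torch.Tensor):
        # h: (batch, seq, hidden_dim) token hidden states
        pred_concepts = self.concept_head(h)             # (batch, seq, concept_dim)
        concept_vecs = self.concept_proj(pred_concepts)  # (batch, seq, hidden_dim)
        b, s, d = h.shape
        # Interleave token states and concept vectors along the sequence axis.
        mixed = torch.stack([h, concept_vecs], dim=2).reshape(b, 2 * s, d)
        # pred_concepts would be trained against the frozen SAE's concepts.
        return mixed, pred_concepts
```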

2. arXiv:2502.06734 [pdf, other] — cs.CV (Computer Vision and Pattern Recognition)
   Title: Señorita-2M: A High-Quality Instruction-based Dataset for General Video Editing by Video Specialists
   Authors: Bojia Zi, Penghui Ruan, Marco Chen, Xianbiao Qi, Shaozhe Hao, Shihao Zhao, Youze Huang, Bin Liang, Rong Xiao, Kam-Fai Wong
   Abstract: Recent advancements in video generation have spurred the development of video editing techniques, which can be divided into inversion-based and end-to-end methods. However, current video editing methods still suffer from several challenges. Inversion-based methods, though training-free and flexible, are time-consuming during inference, struggle with fine-grained editing instructions, and produce artifacts and jitter. On the other hand, end-to-end methods, which rely on edited video pairs for training, offer faster inference speeds but often produce poor editing results due to a lack of high-quality training video pairs. In this paper, to close the gap in end-to-end methods, we introduce Señorita-2M, a high-quality video editing dataset consisting of approximately 2 million video editing pairs. It is built with four high-quality, specialized video editing models, each trained by our team to achieve state-of-the-art editing results. We also propose a filtering pipeline to eliminate poorly edited video pairs.
   Furthermore, we explore common video editing architectures to identify the most effective structure based on current pre-trained generative models. Extensive experiments show that our dataset can help to yield remarkably high-quality video editing results. More details are available at https://senorita.github.io.
   Submitted 10 February, 2025; originally announced February 2025.

3. arXiv:2502.05271 [pdf, other] — cs.RO (Robotics)
   Title: RobotMover: Learning to Move Large Objects by Imitating the Dynamic Chain
   Authors: Tianyu Li, Joanne Truong, Jimmy Yang, Alexander Clegg, Akshara Rai, Sehoon Ha, Xavier Puig
   Abstract: Moving large objects, such as furniture, is a critical capability for robots operating in human environments. This task presents significant challenges due to two key factors: the need to synchronize whole-body movements to prevent collisions between the robot and the object, and the under-actuated dynamics arising from the substantial size and weight of the objects. These challenges also complicate performing these tasks via teleoperation. In this work, we introduce RobotMover, a generalizable learning framework that leverages human-object interaction demonstrations to enable robots to perform large object manipulation tasks.
   Central to our approach is the Dynamic Chain, a novel representation that abstracts human-object interactions so that they can be retargeted to robotic morphologies. The Dynamic Chain is a spatial descriptor connecting the human and object root positions via a chain of nodes, which encode the position and velocity of different interaction keypoints. We train policies in simulation using Dynamic-Chain-based imitation rewards and domain randomization, enabling zero-shot transfer to real-world settings without fine-tuning. Our approach outperforms both learning-based methods and teleoperation baselines across six evaluation metrics when tested on three distinct object types, both in simulation and on physical hardware. Furthermore, we successfully apply the learned policies to real-world tasks, such as moving a trash cart and rearranging chairs.
   Submitted 7 February, 2025; originally announced February 2025.
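
The Dynamic Chain described above maps naturally onto a small data structure. A minimal sketch, with field names assumed for illustration (the abstract specifies only that nodes connect the human and object roots and encode keypoint positions and velocities):

```python
from dataclasses import dataclass
import numpy as np

@dataclass
class ChainNode:
    """One interaction keypoint along the human-to-object chain."""
    position: np.ndarray  # (3,) keypoint position
    velocity: np.ndarray  # (3,) keypoint linear velocity

@dataclass
class DynamicChain:
    """Spatial descriptor linking the human root to the object root."""
    nodes: list[ChainNode]  # ordered: human root first, object root last

    def as_feature(self) -> np.ndarray:
        # Flatten the chain into one vector, e.g. as input to an
        # imitation-reward function during policy training.
        return np.concatenate([np.hstack([n.position, n.velocity]) for n in self.nodes])
```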

4. arXiv:2502.04520 [pdf, other] — cs.CL (Computation and Language)
   Title: Linear Correlation in LM's Compositional Generalization and Hallucination
   Authors: Letian Peng, Chenyang An, Shibo Hao, Chengyu Dong, Jingbo Shang
   Abstract: The generalization of language models (LMs) is under active debate, contrasting their potential for general intelligence with their struggles with basic knowledge composition (e.g., the reverse/transition curse). This paper uncovers the phenomenon of linear correlations in LMs during knowledge composition: there exists a linear transformation between certain related knowledge that maps the next token prediction logits from one prompt to another, e.g., "X lives in the city of" → "X lives in the country of" for every given X. This mirrors the linearity in human knowledge composition, such as Paris → France. Our findings indicate that the linear transformation is resilient to large-scale fine-tuning, generalizing updated knowledge when aligned with real-world relationships, but causing hallucinations when it deviates. Empirical results suggest that linear correlation can serve as a potential identifier of LM generalization. Finally, we show such linear correlations can be learned with a single feedforward network and pre-trained vocabulary representations, indicating that LM generalization relies heavily on the latter.
   Submitted 6 February, 2025; originally announced February 2025.
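
The claimed linearity is directly testable: collect next-token logits for paired prompts and fit a single linear map between them. A self-contained sketch, with synthetic logits standing in for real LM outputs:

```python
import torch

# Synthetic stand-ins: rows would be next-token logits for "X lives in the
# city of" (src) and "X lives in the country of" (tgt) over many X.
n_pairs, vocab = 2048, 256
src = torch.randn(n_pairs, vocab)
latent_map = torch.randn(vocab, vocab) / vocab ** 0.5   # hidden linear relation
tgt = src @ latent_map + 0.01 * torch.randn(n_pairs, vocab)

# Least-squares fit of tgt ≈ src @ W, then measure goodness of fit.
W = torch.linalg.lstsq(src, tgt).solution
r2 = 1 - ((src @ W - tgt) ** 2).sum() / ((tgt - tgt.mean(0)) ** 2).sum()
print(f"R^2 of fitted linear map: {r2:.3f}")  # near 1.0 when the relation is linear
```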

5. arXiv:2501.09235 [pdf, other] — cs.CY (Computers and Society); cs.HC (Human-Computer Interaction)
   Title: The Spread of Virtual Gifting in Live Streaming: The Case of Twitch
   Authors: Ji Eun Kim, Seura Ha, Sangmi Kim, Libby Hemphill
   Abstract: This paper examines how gifting spreads among viewers on Twitch, one of the largest live streaming platforms worldwide. Twitch users can give gift subscriptions to other viewers in the chat room, with the majority of gifters opting for community gifting, which is gifting to randomly selected viewers. We identify the random nature of gift-receiving in our data as a natural experiment setting. We investigate whether gift recipients pay it forward, considering various gift types that may either promote or deter the spread of gifting. Our findings reveal that Twitch viewers who receive gift subscriptions are generally more likely to pay it forward than non-recipients, and the positive impact of gift-receiving becomes stronger when the recipient is the sole beneficiary of the giver's gifting behavior. However, we found that gifts from frequent gifters discourage recipients from paying it forward, and gifts from anonymous gifters do not influence the likelihood of viewers becoming future gifters. This research contributes to the existing literature on the spread of online prosocial behavior by providing robust evidence and suggests practical strategies for promoting online gifting.
   Submitted 15 January, 2025; originally announced January 2025.

6. arXiv:2501.08670 [pdf, other] — cs.SE (Software Engineering)
   Title: Augmenting Smart Contract Decompiler Output through Fine-grained Dependency Analysis and LLM-facilitated Semantic Recovery
   Authors: Zeqin Liao, Yuhong Nan, Zixu Gao, Henglong Liang, Sicheng Hao, Peifan Reng, Zibin Zheng
   Abstract: A decompiler is a specialized type of reverse-engineering tool extensively employed in program analysis tasks, particularly program comprehension and vulnerability detection. However, current Solidity smart contract decompilers face significant limitations in reconstructing the original source code. In particular, the bottleneck of state-of-the-art decompilers lies in inaccurate method identification, incorrect variable type recovery, and missing contract attributes. These deficiencies hinder downstream tasks and understanding of the program logic. To address these challenges, we propose SmartHalo, a new framework that enhances decompiler output by combining static analysis (SA) and large language models (LLMs). SmartHalo leverages the complementary strengths of SA's accuracy in control and data flow analysis and the LLM's capability in semantic prediction. More specifically, SmartHalo constructs a new data structure, the Dependency Graph (DG), to extract semantic dependencies via static analysis. It then uses the DG to create prompts for LLM optimization. Finally, the correctness of the LLM outputs is validated through symbolic execution and formal verification.
   Evaluation on a dataset of 465 randomly selected smart contract methods shows that SmartHalo significantly improves the quality of the decompiled code compared to state-of-the-art decompilers (e.g., Gigahorse). Notably, integrating GPT-4o with SmartHalo further enhances its performance, achieving precision rates of 87.39% for method boundaries, 90.39% for variable types, and 80.65% for contract attributes.
   Submitted 15 January, 2025; originally announced January 2025.
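
To make the Dependency Graph-to-prompt step concrete, here is a hypothetical sketch; the node/edge schema and the prompt wording are invented for illustration, not taken from SmartHalo:

```python
import networkx as nx

# Hypothetical dependency graph: nodes are low-level program facts, edges
# carry the dependency kind recovered by static analysis.
dg = nx.DiGraph()
dg.add_edge("calldata arg0", "sstore(slot 0)", kind="data-flow")
dg.add_edge("require(msg.sender == owner)", "sstore(slot 0)", kind="control-flow")

def prompt_for(node: str, graph: nx.DiGraph) -> str:
    deps = [f"{u} --{d['kind']}--> {node}" for u, _, d in graph.in_edges(node, data=True)]
    return ("Given these semantic dependencies recovered by static analysis:\n"
            + "\n".join(deps)
            + f"\nSuggest a likely Solidity type and name for `{node}`.")

print(prompt_for("sstore(slot 0)", dg))
```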

7. arXiv:2501.06481 [pdf, other] — cs.CV (Computer Vision and Pattern Recognition)
   Title: Focus-N-Fix: Region-Aware Fine-Tuning for Text-to-Image Generation
   Authors: Xiaoying Xing, Avinab Saha, Junfeng He, Susan Hao, Paul Vicol, Moonkyung Ryu, Gang Li, Sahil Singla, Sarah Young, Yinxiao Li, Feng Yang, Deepak Ramachandran
   Abstract: Text-to-image (T2I) generation has made significant advances in recent years, but challenges still remain in the generation of perceptual artifacts, misalignment with complex prompts, and safety. The prevailing approach to address these issues involves collecting human feedback on generated images, training reward models to estimate human feedback, and then fine-tuning T2I models based on the reward models to align them with human preferences. However, while existing reward fine-tuning methods can produce images with higher rewards, they may change model behavior in unexpected ways. For example, fine-tuning for one quality aspect (e.g., safety) may degrade other aspects (e.g., prompt alignment), or may lead to reward hacking (e.g., finding a way to increase rewards without having the intended effect). In this paper, we propose Focus-N-Fix, a region-aware fine-tuning method that trains models to correct only previously problematic image regions. The resulting fine-tuned model generates images with the same high-level structure as the original model but shows significant improvements in regions where the original model was deficient in safety (over-sexualization and violence), plausibility, or other criteria. Our experiments demonstrate that Focus-N-Fix improves these localized quality aspects with little or no degradation to others and typically imperceptible changes in the rest of the image. Disclaimer: This paper contains images that may be overly sexual, violent, offensive, or harmful.
   Submitted 11 January, 2025; originally announced January 2025.
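
The "correct only problematic regions" idea can be sketched as a spatially masked training loss; the objective below is our assumption for illustration, not the paper's exact formulation:

```python
import torch

def region_masked_loss(pred: torch.Tensor, target: torch.Tensor,
                       region_mask: torch.Tensor) -> torch.Tensor:
    """Penalize error only inside flagged problem regions.

    pred, target: (batch, C, H, W) model outputs and fine-tuning targets.
    region_mask:  (batch, 1, H, W), 1 where a reward model flagged a problem
                  (artifact, unsafe content, ...), 0 elsewhere. Masking keeps
                  gradients out of already-good regions, preserving the
                  image's high-level structure.
    """
    err = (pred - target) ** 2
    return (err * region_mask).sum() / region_mask.sum().clamp(min=1)
```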

8. arXiv:2501.05155 [pdf, other] — cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
   Title: Biomedical Relation Extraction via Adaptive Document-Relation Cross-Mapping and Concept Unique Identifier
   Authors: Yufei Shang, Yanrong Guo, Shijie Hao, Richang Hong
   Abstract: Document-Level Biomedical Relation Extraction (Bio-RE) aims to identify relations between biomedical entities within extensive texts, serving as a crucial subfield of biomedical text mining. Existing Bio-RE methods struggle with cross-sentence inference, which is essential for capturing relations spanning multiple sentences. Moreover, previous methods often overlook the incompleteness of documents and lack the integration of external knowledge, limiting contextual richness. In addition, the scarcity of annotated data further hampers model training. Recent advancements in large language models (LLMs) have inspired us to explore all of the above issues for document-level Bio-RE. Specifically, we propose a document-level Bio-RE framework via LLM Adaptive Document-Relation Cross-Mapping (ADRCM) fine-tuning and Concept Unique Identifier (CUI) Retrieval-Augmented Generation (RAG). First, we introduce the Iteration-of-REsummary (IoRs) prompt to address the data-scarcity issue: Bio-RE task-specific synthetic data can be generated by guiding ChatGPT to focus on entity relations and iteratively refining the synthetic data. Next, we propose ADRCM fine-tuning, a novel fine-tuning recipe that establishes mappings across different documents and relations, enhancing the model's contextual understanding and cross-sentence inference capabilities.
   Finally, during inference, a biomedical-specific RAG approach, named CUI RAG, is designed to leverage CUIs as indexes for entities, narrowing the retrieval scope and enriching the relevant document contexts. Experiments conducted on three Bio-RE datasets (GDA, CDR, and BioRED) demonstrate the state-of-the-art performance of our proposed method in comparison with other related work.
   Submitted 9 January, 2025; originally announced January 2025.
   Comments: 13 pages, 6 figures
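
The CUI-as-index idea reduces to a simple inverted index over entity identifiers. A toy sketch (the CUIs and snippets are made up for illustration):

```python
from collections import defaultdict

# Inverted index from Concept Unique Identifier (CUI) to documents that
# mention an entity carrying that CUI.
cui_index = defaultdict(set)
docs = {
    0: (("C0011847", "C0017337"), "Disease X is associated with gene Y ..."),
    1: (("C0011847", "C0038454"), "Disease X co-occurs with stroke ..."),
}
for doc_id, (cuis, _text) in docs.items():
    for cui in cuis:
        cui_index[cui].add(doc_id)

def retrieve(query_cuis):
    # Restrict retrieval to documents sharing a CUI with the query entities,
    # narrowing the scope exactly as the abstract describes.
    hits = set()
    for cui in query_cuis:
        hits |= cui_index[cui]
    return hits

print(retrieve(["C0017337"]))  # {0}
```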

9. arXiv:2501.04594 [pdf, other] — cs.RO (Robotics)
   Title: Understanding Expectations for a Robotic Guide Dog for Visually Impaired People
   Authors: J. Taery Kim, Morgan Byrd, Jack L. Crandell, Bruce N. Walker, Greg Turk, Sehoon Ha
   Abstract: Robotic guide dogs hold significant potential to enhance the autonomy and mobility of blind or visually impaired (BVI) individuals by offering universal assistance over unstructured terrains at affordable costs. However, the design of robotic guide dogs remains underexplored, particularly in systematic aspects such as gait controllers, navigation behaviors, interaction methods, and verbal explanations. Our study addresses this gap by conducting user studies with 18 BVI participants, comprising 15 cane users and three guide dog users. Participants interacted with a quadrupedal robot and provided both quantitative and qualitative feedback. Our study revealed several design implications, such as a preference for a learning-based controller and a rigid handle, gradual turns with asymmetric speeds, semantic communication methods, and explainability. The study also highlighted the importance of customization to support users with diverse backgrounds and preferences, along with practical concerns such as battery life, maintenance, and weather issues. These findings offer valuable insights and design implications for future research and development of robotic guide dogs.
   Submitted 8 January, 2025; originally announced January 2025.
   Comments: 12 pages, 4 figures, Proceedings of the 2025 ACM/IEEE International Conference on Human-Robot Interaction (HRI'25)

10. arXiv:2501.00328 [pdf, other] — cs.SD (Sound); cs.CL (Computation and Language); eess.AS (Audio and Speech Processing)
   Title: VoxVietnam: a Large-Scale Multi-Genre Dataset for Vietnamese Speaker Recognition
   Authors: Hoang Long Vu, Phuong Tuan Dat, Pham Thao Nhi, Nguyen Song Hao, Nguyen Thi Thu Trang
   Abstract: Recent research in speaker recognition aims to address vulnerabilities due to variations between enrolment and test utterances, particularly in the multi-genre phenomenon, where the utterances are in different speech genres. Previous resources for Vietnamese speaker recognition are either limited in size or do not focus on genre diversity, leaving studies of multi-genre effects unexplored. This paper introduces VoxVietnam, the first multi-genre dataset for Vietnamese speaker recognition, with over 187,000 utterances from 1,406 speakers, along with an automated pipeline to construct a large-scale dataset from public sources. Our experiments show the challenges the multi-genre phenomenon poses to models trained on a single-genre dataset, and demonstrate a significant performance gain when VoxVietnam is incorporated into multi-genre training.
   Submitted 31 December, 2024; originally announced January 2025.
   Comments: Accepted to the 2025 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)

11. arXiv:2412.16834 [pdf, ps, other] — cs.AI (Artificial Intelligence); cs.GT (Computer Science and Game Theory)
   Title: Online Learning from Strategic Human Feedback in LLM Fine-Tuning
   Authors: Shugang Hao, Lingjie Duan
   Abstract: Reinforcement learning from human feedback (RLHF) has become an essential step in fine-tuning large language models (LLMs) to align them with human preferences. However, human labelers are selfish and have diverse preferences. They may strategically misreport their online feedback to influence the system's aggregation towards their own preferences. Current practice simply averages labelers' feedback per time slot and fails to identify the most accurate human labeler, leading to linear regret $\mathcal{O}(T)$ over $T$ time slots. To the best of our knowledge, we are the first to study online learning mechanisms against strategic human labelers in the LLM fine-tuning process. We formulate a new dynamic Bayesian game and dynamically adjust human labelers' weights in the preference aggregation, ensuring truthful feedback and sublinear regret $\mathcal{O}(T^{1/2})$. Simulation results demonstrate our mechanism's great advantages over existing benchmark schemes.
   Submitted 23 December, 2024; v1 submitted 21 December, 2024; originally announced December 2024.
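
The weight-adjustment idea can be illustrated with a standard multiplicative-weights aggregator; this is a generic online-learning sketch that assumes labeler accuracy is eventually observable, not the paper's Bayesian-game mechanism:

```python
import numpy as np

rng = np.random.default_rng(0)
T, n_labelers = 2000, 5
accuracy = np.array([0.9, 0.6, 0.55, 0.5, 0.5])  # labeler 0 is most accurate
weights = np.ones(n_labelers)
eta = 0.1  # step size

for t in range(T):
    truth = rng.integers(0, 2)  # latent correct preference this round
    reports = np.where(rng.random(n_labelers) < accuracy, truth, 1 - truth)
    losses = (reports != truth).astype(float)
    weights *= np.exp(-eta * losses)  # down-weight inaccurate labelers

print("final weights:", np.round(weights / weights.sum(), 3))
# The most accurate labeler comes to dominate the aggregation over time,
# which is the effect a sublinear-regret mechanism is designed to achieve.
```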

12. arXiv:2412.16830 [pdf, ps, other] — cs.LG (Machine Learning); cs.DS (Data Structures and Algorithms); cs.NI (Networking and Internet Architecture)
   Title: Algorithm Design for Continual Learning in IoT Networks
   Authors: Shugang Hao, Lingjie Duan
   Abstract: Continual learning (CL) is an online learning technique for sequentially generated streaming data from different tasks, aiming to maintain a small forgetting loss on previously learned tasks. Existing work focuses on reducing the forgetting loss under a given task sequence. However, if similar tasks continuously appear until the end time, the forgetting loss is still huge on prior distinct tasks. In practical IoT networks, an autonomous vehicle that samples data and learns different tasks can reroute to alter the order of its task pattern, at an increased travelling cost. To the best of our knowledge, we are the first to study how to opportunistically route the testing object and alter the task sequence in CL. We formulate a new optimization problem and prove it NP-hard. We propose a polynomial-time algorithm that achieves approximation ratios of $\frac{3}{2}$ for the underparameterized case and $\frac{3}{2} + r^{1-T}$ for the overparameterized case, respectively, where $r := 1-\frac{n}{m}$ is a parameter of the feature number $m$ and sample number $n$, and $T$ is the task number. Simulation results verify our algorithm's close-to-optimum performance.
   Submitted 23 December, 2024; v1 submitted 21 December, 2024; originally announced December 2024.
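
For concreteness, the stated approximation-ratio bounds transcribe directly into code:

```python
def approx_ratio_bound(m: int, n: int, T: int, overparameterized: bool) -> float:
    """Bounds from the abstract: 3/2 (underparameterized case) or
    3/2 + r**(1 - T) with r = 1 - n/m (overparameterized case)."""
    if not overparameterized:
        return 1.5
    r = 1 - n / m
    return 1.5 + r ** (1 - T)

# e.g. m = 100 features, n = 20 samples, T = 3 tasks: 1.5 + 0.8**(-2) = 3.0625
print(approx_ratio_bound(m=100, n=20, T=3, overparameterized=True))
```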
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16830v2-abstract-full').style.display = 'none'; document.getElementById('2412.16830v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16145">arXiv:2412.16145</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16145">pdf</a>, <a href="https://arxiv.org/format/2412.16145">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Offline Reinforcement Learning for LLM Multi-Step Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Huaijie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+S">Shibo Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+H">Hanze Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shenao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+Y">Yilin Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Ziran Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yi Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16145v2-abstract-short" style="display: inline;"> Improving the multi-step reasoning ability of large language models (LLMs) with offline reinforcement learning (RL) is essential for quickly adapting them to complex tasks. While Direct Preference Optimization (DPO) has shown promise in aligning LLMs with human preferences, it is less suitable for multi-step reasoning tasks because (1) DPO relies on paired preference data, which is not readily ava&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16145v2-abstract-full').style.display = 'inline'; document.getElementById('2412.16145v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16145v2-abstract-full" style="display: none;"> Improving the multi-step reasoning ability of large language models (LLMs) with offline reinforcement learning (RL) is essential for quickly adapting them to complex tasks. While Direct Preference Optimization (DPO) has shown promise in aligning LLMs with human preferences, it is less suitable for multi-step reasoning tasks because (1) DPO relies on paired preference data, which is not readily available for multi-step reasoning tasks, and (2) it treats all tokens uniformly, making it ineffective for credit assignment in multi-step reasoning tasks, which often come with sparse reward. 
arXiv:2412.10436 (https://arxiv.org/abs/2412.10436) [cs.CV, cs.LG]
Benchmarking Federated Learning for Semantic Datasets: Federated Scene Graph Generation
Authors: SeungBum Ha, Taehwan Lee, Jiyoun Lim, Sung Whan Yoon
Abstract: Federated learning (FL) has recently garnered attention as a data-decentralized training framework that enables the learning of deep models from locally distributed samples while keeping data privacy. Built upon the framework, immense efforts have been made to establish FL benchmarks, which provide rigorous evaluation settings that control data heterogeneity across clients. Prior efforts have mainly focused on handling relatively simple classification tasks, where each sample is annotated with a one-hot label, such as MNIST, CIFAR, the LEAF benchmark, etc. However, little attention has been paid to demonstrating an FL benchmark that handles complicated semantics, where each sample encompasses diverse semantic information from multiple labels, such as Panoptic Scene Graph Generation (PSG) with objects, subjects, and relations between them. Because existing benchmarks are designed to distribute data in a narrow view of a single semantic, e.g., a one-hot label, managing the complicated semantic heterogeneity across clients when formalizing FL benchmarks is non-trivial. In this paper, we propose a benchmark process to establish an FL benchmark with controllable semantic heterogeneity across clients: the two key steps are (i) data clustering with semantics and (ii) data distribution via controllable semantic heterogeneity across clients. As a proof of concept, we first construct a federated PSG benchmark, demonstrating the efficacy of existing PSG methods in an FL setting with controllable semantic heterogeneity of scene graphs. We also demonstrate the effectiveness of our benchmark by applying federated learning algorithms that are robust to data heterogeneity, showing improved performance. Our code is available at https://github.com/Seung-B/FL-PSG.
Submitted 11 December, 2024; originally announced December 2024.
Comments: This work has been submitted to the IEEE for possible publication.
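A common device for the second step (controllable heterogeneity) in FL benchmarks is Dirichlet partitioning over cluster assignments; a minimal sketch under that assumption follows. This is the generic device, not necessarily the paper's exact procedure.

    import numpy as np

    def partition(cluster_ids, n_clients, alpha, seed=0):
        # Split sample indices across clients; smaller alpha -> stronger
        # semantic heterogeneity (each client sees fewer clusters).
        rng = np.random.default_rng(seed)
        clients = [[] for _ in range(n_clients)]
        for c in np.unique(cluster_ids):
            idx = np.flatnonzero(cluster_ids == c)
            rng.shuffle(idx)
            # Dirichlet-distributed share of this semantic cluster per client.
            shares = rng.dirichlet(alpha * np.ones(n_clients))
            cuts = (np.cumsum(shares)[:-1] * len(idx)).astype(int)
            for client, part in zip(clients, np.split(idx, cuts)):
                client.extend(part.tolist())
        return clients

    labels = np.random.randint(0, 5, size=200)   # 200 samples, 5 semantic clusters
    print([len(c) for c in partition(labels, n_clients=4, alpha=0.3)])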
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06769">arXiv:2412.06769</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.06769">pdf</a>, <a href="https://arxiv.org/format/2412.06769">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Training Large Language Models to Reason in a Continuous Latent Space </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hao%2C+S">Shibo Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Sukhbaatar%2C+S">Sainbayar Sukhbaatar</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+D">DiJia Su</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Z">Zhiting Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Y">Yuandong Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06769v2-abstract-short" style="display: inline;"> Large language models (LLMs) are restricted to reason in the &#34;language space&#34;, where they typically express the reasoning process with a chain-of-thought (CoT) to solve a complex reasoning problem. However, we argue that language space may not always be optimal for reasoning. For example, most word tokens are primarily for textual coherence and not essential for reasoning, while some critical toke&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06769v2-abstract-full').style.display = 'inline'; document.getElementById('2412.06769v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06769v2-abstract-full" style="display: none;"> Large language models (LLMs) are restricted to reason in the &#34;language space&#34;, where they typically express the reasoning process with a chain-of-thought (CoT) to solve a complex reasoning problem. However, we argue that language space may not always be optimal for reasoning. For example, most word tokens are primarily for textual coherence and not essential for reasoning, while some critical tokens require complex planning and pose huge challenges to LLMs. To explore the potential of LLM reasoning in an unrestricted latent space instead of using natural language, we introduce a new paradigm Coconut (Chain of Continuous Thought). We utilize the last hidden state of the LLM as a representation of the reasoning state (termed &#34;continuous thought&#34;). Rather than decoding this into a word token, we feed it back to the LLM as the subsequent input embedding directly in the continuous space. Experiments show that Coconut can effectively augment the LLM on several reasoning tasks. 
arXiv:2411.18000 (https://arxiv.org/abs/2411.18000) [cs.CV]
Exploring Visual Vulnerabilities via Multi-Loss Adversarial Search for Jailbreaking Vision-Language Models
Authors: Shuyang Hao, Bryan Hooi, Jun Liu, Kai-Wei Chang, Zi Huang, Yujun Cai
Abstract: Despite inheriting security measures from underlying language models, Vision-Language Models (VLMs) may still be vulnerable to safety alignment issues. Through empirical analysis, we uncover two critical findings: scenario-matched images can significantly amplify harmful outputs, and, contrary to common assumptions in gradient-based attacks, minimal loss values do not guarantee optimal attack effectiveness. Building on these insights, we introduce MLAI (Multi-Loss Adversarial Images), a novel jailbreak framework that leverages scenario-aware image generation for semantic alignment, exploits flat minima theory for robust adversarial image selection, and employs multi-image collaborative attacks for enhanced effectiveness. Extensive experiments demonstrate MLAI's significant impact, achieving attack success rates of 77.75% on MiniGPT-4 and 82.80% on LLaVA-2, substantially outperforming existing methods by margins of 34.37% and 12.77%, respectively. Furthermore, MLAI shows considerable transferability to commercial black-box VLMs, achieving up to a 60.11% success rate. Our work reveals fundamental visual vulnerabilities in current VLM safety mechanisms and underscores the need for stronger defenses. Warning: This paper contains potentially harmful example text.
Submitted 27 November, 2024; v1 submitted 26 November, 2024; originally announced November 2024.
arXiv:2411.14733 (https://arxiv.org/abs/2411.14733) [cs.LG, eess.IV, eess.SY]
FLARE: FP-Less PTQ and Low-ENOB ADC Based AMS-PiM for Error-Resilient, Fast, and Efficient Transformer Acceleration
Authors: Donghyeon Yi, Seoyoung Lee, Jongho Kim, Junyoung Kim, Sohmyung Ha, Ik Joon Chang, Minkyu Je
Abstract: Encoder-based transformers, powered by self-attention layers, have revolutionized machine learning with their context-aware representations. However, their quadratic growth in computational and memory demands presents significant bottlenecks. Analog-Mixed-Signal Process-in-Memory (AMS-PiM) architectures address these challenges by enabling efficient on-chip processing. Traditionally, AMS-PiM relies on Quantization-Aware Training (QAT), which is hardware-efficient but requires extensive retraining to adapt models to AMS-PiMs, making it increasingly impractical for transformer models. Post-Training Quantization (PTQ) mitigates this training overhead but introduces significant hardware inefficiencies: PTQ relies on dequantization-quantization (DQ-Q) processes, floating-point units (FPUs), and high-ENOB (Effective Number of Bits) analog-to-digital converters (ADCs). In particular, high-ENOB ADCs scale exponentially in area and energy ($2^{ENOB}$), reduce sensing margins, and increase susceptibility to process, voltage, and temperature (PVT) variations, further compounding PTQ's challenges in AMS-PiM systems. To overcome these limitations, we propose RAP, an AMS-PiM architecture that eliminates DQ-Q processes, introduces FPU- and division-free nonlinear processing, and employs a low-ENOB-ADC-based sparse matrix-vector multiplication technique. Using the proposed techniques, RAP improves error resiliency, area/energy efficiency, and computational speed while preserving numerical stability. Experimental results demonstrate that RAP outperforms state-of-the-art GPUs and conventional PiM architectures in energy efficiency, latency, and accuracy, making it a scalable solution for the efficient deployment of transformers.
Submitted 22 November, 2024; originally announced November 2024.
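The exponential ADC cost the abstract cites is worth seeing numerically: area/energy scaling proportional to $2^{ENOB}$ means each extra effective bit doubles the cost, which is why shaving even a few ENOB matters (illustrative, normalized units).

    # ADC area/energy grows as 2^ENOB (here normalized to a 4-bit ADC).
    for enob in (4, 6, 8, 10, 12):
        print(enob, 2 ** enob // 2 ** 4)   # 1, 4, 16, 64, 256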
arXiv:2411.02528 (https://arxiv.org/abs/2411.02528) [cs.CL]
What Goes Into a LM Acceptability Judgment? Rethinking the Impact of Frequency and Length
Authors: Lindia Tjuatja, Graham Neubig, Tal Linzen, Sophie Hao
Abstract: When comparing the linguistic capabilities of language models (LMs) with humans using LM probabilities, factors such as the length of the sequence and the unigram frequency of lexical items have a significant effect on LM probabilities in ways that humans are largely robust to. Prior work comparing LM and human acceptability judgments treats these effects uniformly across models, making the strong assumption that models require the same degree of adjustment to control for length and unigram frequency effects. We propose MORCELA, a new linking theory between LM scores and acceptability judgments in which the optimal level of adjustment for these effects is estimated from data via learned parameters for length and unigram frequency. We first show that MORCELA outperforms a commonly used linking theory for acceptability, SLOR (Pauls and Klein, 2012; Lau et al., 2017), across two families of transformer LMs (Pythia and OPT). Furthermore, we demonstrate that the degrees of adjustment assumed by SLOR for length and unigram frequency overcorrect for these confounds, and that larger models require a lower relative degree of adjustment for unigram frequency, though a significant amount of adjustment is still necessary for all models. Finally, our subsequent analysis shows that larger LMs' lower susceptibility to frequency effects can be explained by an ability to better predict rarer words in context.
Submitted 4 November, 2024; originally announced November 2024.
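The baseline linking theory has a standard closed form: with $s$ a sentence, $p_{\mathrm{LM}}$ the model probability, and $p_u$ the unigram probability,

    $$\mathrm{SLOR}(s) = \frac{\log p_{\mathrm{LM}}(s) - \log p_{u}(s)}{|s|}$$

SLOR thus fixes the frequency correction at unit weight and the length normalization at $1/|s|$; per the abstract, MORCELA instead estimates the degree of each correction from human judgment data via learned parameters.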
arXiv:2411.01623 (https://arxiv.org/abs/2411.01623) [cs.LG, cs.AI, eess.SP]
FilterNet: Harnessing Frequency Filters for Time Series Forecasting
Authors: Kun Yi, Jingru Fei, Qi Zhang, Hui He, Shufeng Hao, Defu Lian, Wei Fan
Abstract: While numerous forecasters have been proposed using different network architectures, Transformer-based models have state-of-the-art performance in time series forecasting. However, Transformer-based forecasters still suffer from vulnerability to high-frequency signals, limited computational efficiency, and bottlenecks in full-spectrum utilization, which are essentially the cornerstones of accurately predicting time series with thousands of points. In this paper, we explore a novel signal-processing perspective on deep time series forecasting. Inspired by the filtering process, we introduce one simple yet effective network, FilterNet, built upon our proposed learnable frequency filters, which extract key informative temporal patterns by selectively passing or attenuating certain components of time series signals. Concretely, we propose two kinds of learnable filters in FilterNet: (i) a plain shaping filter, which adopts a universal frequency kernel for signal filtering and temporal modeling; and (ii) a contextual shaping filter, which utilizes filtered frequencies examined in terms of their compatibility with input signals for dependency learning. Equipped with the two filters, FilterNet can approximately surrogate the linear and attention mappings widely adopted in the time series literature, while enjoying superb abilities in handling high-frequency noise and utilizing the whole frequency spectrum, which is beneficial for forecasting. Finally, we conduct extensive experiments on eight time series forecasting benchmarks, and the experimental results demonstrate superior performance in terms of both effectiveness and efficiency compared with state-of-the-art methods. Code is available at this repository: https://github.com/aikunyi/FilterNet
Submitted 4 November, 2024; v1 submitted 3 November, 2024; originally announced November 2024.
Comments: Accepted by NeurIPS 2024.
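The "plain shaping filter" idea, a learnable frequency-domain kernel applied via FFT, pointwise multiplication, and inverse FFT, can be sketched in a few lines. This is a generic learnable frequency filter under that reading, not the paper's exact module.

    import torch
    import torch.nn as nn

    class PlainShapingFilter(nn.Module):
        # Learnable frequency filter: rFFT -> learned complex kernel -> irFFT.
        def __init__(self, seq_len):
            super().__init__()
            n_freq = seq_len // 2 + 1
            self.kernel = nn.Parameter(torch.randn(n_freq, dtype=torch.cfloat))

        def forward(self, x):                  # x: (batch, seq_len)
            spec = torch.fft.rfft(x, dim=-1)   # to the frequency domain
            spec = spec * self.kernel          # pass/attenuate each component
            return torch.fft.irfft(spec, n=x.size(-1), dim=-1)

    x = torch.randn(8, 96)
    print(PlainShapingFilter(96)(x).shape)     # torch.Size([8, 96])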
arXiv:2411.01494 (https://arxiv.org/abs/2411.01494) [cs.CV]
Finding NeMo: Negative-mined Mosaic Augmentation for Referring Image Segmentation
Authors: Seongsu Ha, Chaeyun Kim, Donghwa Kim, Junho Lee, Sangho Lee, Joonseok Lee
Abstract: Referring Image Segmentation is a comprehensive task to segment an object referred to by a textual query from an image. In nature, the level of difficulty in this task is affected by the existence of similar objects and the complexity of the referring expression. Recent RIS models still show a significant performance gap between easy and hard scenarios. We posit that the bottleneck exists in the data, and propose a simple but powerful data augmentation method, Negative-mined Mosaic Augmentation (NeMo). This method augments a training image into a mosaic with three other negative images carefully curated by a pretrained multimodal alignment model, e.g., CLIP, to make the sample more challenging. We discover that it is critical to properly adjust the difficulty level, neither too ambiguous nor too trivial. The augmented training data encourage the RIS model to recognize subtle differences and relationships between similar visual entities and to concretely understand the whole expression to locate the right target better. Our approach shows consistent improvements on various datasets and models, verified by extensive experiments.
Submitted 3 November, 2024; originally announced November 2024.
Comments: Accepted at ECCV 2024. Project page: https://dddonghwa.github.io/NeMo/
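The augmentation itself is simple to sketch: pick three negatives whose alignment-model similarity to the anchor falls in a middle band (hard but not ambiguous) and tile a 2x2 mosaic. The thresholds, layout, and function names below are assumptions for illustration.

    import numpy as np

    def nemo_mosaic(anchor, pool, sims, lo=0.4, hi=0.8, rng=None):
        # Mosaic the anchor with three mid-similarity negatives: harder than
        # random negatives, but not ambiguous near-duplicates of the target.
        rng = rng or np.random.default_rng()
        band = [img for img, s in zip(pool, sims) if lo <= s <= hi]
        picks = rng.choice(len(band), size=3, replace=False)
        tiles = [anchor] + [band[i] for i in picks]
        top = np.concatenate(tiles[:2], axis=1)       # H x 2W
        bottom = np.concatenate(tiles[2:], axis=1)
        return np.concatenate([top, bottom], axis=0)  # 2H x 2W mosaic

    imgs = [np.zeros((64, 64, 3), np.uint8) for _ in range(10)]
    sims = np.linspace(0.1, 0.9, 10)
    print(nemo_mosaic(imgs[0], imgs[1:], sims[1:]).shape)  # (128, 128, 3)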
arXiv:2410.18087 (https://arxiv.org/abs/2410.18087) [cs.IR, cs.AI]
CUPID: A Real-Time Session-Based Reciprocal Recommendation System for a One-on-One Social Discovery Platform
Authors: Beomsu Kim, Sangbum Kim, Minchan Kim, Joonyoung Yi, Sungjoo Ha, Suhyun Lee, Youngsoo Lee, Gihun Yeom, Buru Chang, Gihun Lee
Abstract: This study introduces CUPID, a novel approach to session-based reciprocal recommendation systems designed for a real-time one-on-one social discovery platform. In such platforms, low latency is critical to enhance user experiences. However, conventional session-based approaches struggle with high latency due to the demands of modeling sequential user behavior for each recommendation process. Additionally, given the reciprocal nature of the platform, where users act as items for each other, training recommendation models on large-scale datasets is computationally prohibitive using conventional methods. To address these challenges, CUPID decouples the time-intensive user session modeling from the real-time user matching process to reduce inference time. Furthermore, CUPID employs a two-phase training strategy that separates the training of the embedding and prediction layers, significantly reducing the computational burden by decreasing the number of sequential model inferences by several hundredfold. Extensive experiments on large-scale Azar datasets demonstrate CUPID's effectiveness in a real-world production environment. Notably, CUPID reduces response latency by more than 76% compared to non-asynchronous systems, while significantly improving user engagement.
Submitted 8 October, 2024; originally announced October 2024.
Comments: The 2nd International Workshop on User Understanding from Big Data Workshop (DMU2 2024).
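The latency argument is architectural: keep the expensive sequential session encoder off the request path and leave only a cheap similarity lookup online. A schematic sketch under that reading follows; the interfaces and the min-of-both-directions reciprocal score are assumptions, not CUPID's actual design.

    import numpy as np

    class ReciprocalMatcher:
        # Decoupled serving sketch: a slow path refreshes session embeddings
        # asynchronously; the hot path is only a dot-product ranking.
        def __init__(self):
            self.seek = {}   # user_id -> embedding acting as "seeker"
            self.cand = {}   # user_id -> embedding acting as "candidate"

        def refresh(self, uid, seek_vec, cand_vec):
            # Slow path: a sequential session model would produce these
            # vectors outside the request, e.g. on a background worker.
            self.seek[uid], self.cand[uid] = seek_vec, cand_vec

        def match(self, uid, candidates):
            # Hot path: reciprocal score requires both directions to be high
            # (min of the two directional scores), O(dim) per candidate.
            return max(candidates,
                       key=lambda c: min(self.seek[uid] @ self.cand[c],
                                         self.seek[c] @ self.cand[uid]))

    m = ReciprocalMatcher()
    rng = np.random.default_rng(0)
    for uid in "abcd":
        m.refresh(uid, rng.standard_normal(8), rng.standard_normal(8))
    print(m.match("a", ["b", "c", "d"]))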
arXiv:2410.16257 (https://arxiv.org/abs/2410.16257) [cs.CV]
Elucidating the design space of language models for image generation
Authors: Xuantong Liu, Shaozhe Hao, Xianbiao Qi, Tianyang Hu, Jun Wang, Rong Xiao, Yuan Yao
Abstract: The success of autoregressive (AR) language models in text generation has inspired the computer vision community to adopt Large Language Models (LLMs) for image generation. However, considering the essential differences between text and image modalities, the design space of language models for image generation remains underexplored. We observe that image tokens exhibit greater randomness compared to text tokens, which presents challenges when training with token prediction. Nevertheless, AR models demonstrate their potential by effectively learning patterns even from a seemingly suboptimal optimization problem. Our analysis also reveals that while all models successfully grasp the importance of local information in image generation, smaller models struggle to capture the global context. In contrast, larger models showcase improved capabilities in this area, helping to explain the performance gains achieved when scaling up model size. We further elucidate the design space of language models for vision generation, including tokenizer choice, model choice, model scalability, vocabulary design, and sampling strategy, through extensive comparative experiments. Our work is the first to analyze the optimization behavior of language models in vision generation, and we believe it can inspire more effective designs when applying LMs to other domains. Finally, our elucidated language model for image generation, termed ELM, achieves state-of-the-art performance on the ImageNet 256x256 benchmark. The code is available at https://github.com/Pepperlll/LMforImageGeneration.git.
Submitted 21 October, 2024; originally announced October 2024.
Comments: Project page: https://pepper-lll.github.io/LMforImageGeneration/
arXiv:2410.14672 (https://arxiv.org/abs/2410.14672) [cs.CV, cs.AI]
BiGR: Harnessing Binary Latent Codes for Image Generation and Improved Visual Representation Capabilities
Authors: Shaozhe Hao, Xuantong Liu, Xianbiao Qi, Shihao Zhao, Bojia Zi, Rong Xiao, Kai Han, Kwan-Yee K. Wong
Abstract: We introduce BiGR, a novel conditional image generation model using compact binary latent codes for generative training, focusing on enhancing both generation and representation capabilities. BiGR is the first conditional generative model that unifies generation and discrimination within the same framework. BiGR features a binary tokenizer, a masked modeling mechanism, and a binary transcoder for binary code prediction. Additionally, we introduce a novel entropy-ordered sampling method to enable efficient image generation. Extensive experiments validate BiGR's superior performance in generation quality, as measured by FID-50k, and representation capabilities, as evidenced by linear-probe accuracy. Moreover, BiGR showcases zero-shot generalization across various vision tasks, enabling applications such as image inpainting, outpainting, editing, interpolation, and enrichment, without the need for structural modifications. Our findings suggest that BiGR unifies generative and discriminative tasks effectively, paving the way for further advancements in the field. We further enable BiGR to perform text-to-image generation, showcasing its potential for broader applications.
Submitted 5 January, 2025; v1 submitted 18 October, 2024; originally announced October 2024.
Comments: Updated with additional T2I results; Project page: https://haoosz.github.io/BiGR
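A binary tokenizer, in the loosest sense, reduces continuous latents to {0,1} codes; a generic way to make such a quantizer trainable is a straight-through estimator, sketched below. This is the generic device only, not necessarily BiGR's actual tokenizer.

    import torch

    def binary_quantize(z):
        # Map continuous latents to {0,1} codes; the straight-through trick
        # uses a soft surrogate so gradients pass through the hard threshold.
        hard = (z > 0).float()
        soft = torch.sigmoid(z)
        return hard + soft - soft.detach()

    z = torch.randn(2, 16, requires_grad=True)
    codes = binary_quantize(z)
    codes.sum().backward()
    print(codes[0], z.grad is not None)   # binary codes, and gradients flow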
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Updated with additional T2I results; Project page: https://haoosz.github.io/BiGR</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14168">arXiv:2410.14168</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.14168">pdf</a>, <a href="https://arxiv.org/format/2410.14168">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Elements of disinformation theory: cyber engagement via increasing adversary information consumption </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cuvelier%2C+T">Travis Cuvelier</a>, <a href="/search/cs?searchtype=author&amp;query=Ha%2C+S">Sean Ha</a>, <a href="/search/cs?searchtype=author&amp;query=Morovitz%2C+M">Maretta Morovitz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14168v1-abstract-short" style="display: inline;"> We consider the case where an adversary is conducting a surveillance campaign against a networked control system (NCS), and take the perspective of a defender/control system operator who has successfully isolated the cyber intruder. To better understand the adversary&#39;s intentions and to drive up their operating costs, the defender directs the adversary towards a ``honeypot&#34; that emulates a real co&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14168v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14168v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14168v1-abstract-full" style="display: none;"> We consider the case where an adversary is conducting a surveillance campaign against a networked control system (NCS), and take the perspective of a defender/control system operator who has successfully isolated the cyber intruder. To better understand the adversary&#39;s intentions and to drive up their operating costs, the defender directs the adversary towards a ``honeypot&#34; that emulates a real control system and without actual connections to a physical plant. We propose a strategy for adversary engagement within the ``honey&#34; control system to increase the adversary&#39;s costs of information processing. We assume that, based on an understanding of the adversary&#39;s control theoretic goals, cyber threat intelligence (CTI) provides the defender knowledge of the adversary&#39;s preferences for information acquisition. We use this knowledge to spoof sensor readings to maximize the amount of information the adversary consumes while making it (information theoretically) difficult for the adversary to detect that they are being spoofed. 
arXiv:2410.13057 (https://arxiv.org/abs/2410.13057) [cs.CL, cs.AI]
ERAS: Evaluating the Robustness of Chinese NLP Models to Morphological Garden Path Errors
Authors: Qinchan Li, Sophie Hao
Abstract: In languages without orthographic word boundaries, NLP models perform word segmentation, either as an explicit preprocessing step or as an implicit step in an end-to-end computation. This paper shows that Chinese NLP models are vulnerable to morphological garden path errors: errors caused by a failure to resolve local word segmentation ambiguities using sentence-level morphosyntactic context. We propose a benchmark, ERAS, that tests a model's vulnerability to morphological garden path errors by comparing its behavior on sentences with and without local segmentation ambiguities. Using ERAS, we show that word segmentation models make garden path errors on locally ambiguous sentences, but do not make equivalent errors on unambiguous sentences. We further show that sentiment analysis models with character-level tokenization make implicit garden path errors, even without an explicit word segmentation step in the pipeline. Our results indicate that models' segmentation of Chinese text often fails to account for morphosyntactic context.
Submitted 16 October, 2024; originally announced October 2024.
Comments: Under review in ARR/NAACL.
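To fix ideas, a classic Chinese example of the kind of local ambiguity involved (our illustration, not an item from ERAS): the prefix of the sentence below is consistent with two incompatible segmentations, and only later context resolves them.

    # "研究生命的起源" = "study the origin of life"
    sentence = "研究生命的起源"
    intended = ["研究", "生命", "的", "起源"]      # study / life / 's / origin
    garden_path = ["研究生", "命", "的", "起源"]   # graduate-student / fate / ...
    # A garden path error commits to 研究生 ("graduate student") before the
    # morphosyntactic context rules that reading out.
    print("".join(intended) == sentence == "".join(garden_path))  # True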
We further show that sentiment analysis models with character-level tokenization make implicit garden path errors, even without an explicit word segmentation step in the pipeline. Our results indicate that models&#39; segmentation of Chinese text often fails to account for morphosyntactic context. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13057v1-abstract-full').style.display = 'none'; document.getElementById('2410.13057v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review in ARR/NAACL</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08530">arXiv:2410.08530</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08530">pdf</a>, <a href="https://arxiv.org/format/2410.08530">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Ego3DT: Tracking Every 3D Object in Ego-centric Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hao%2C+S">Shengyu Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Chai%2C+W">Wenhao Chai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhonghan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Meiqi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+W">Wendi Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jieyang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yixian Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhou Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Gaoang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08530v1-abstract-short" style="display: inline;"> The growing interest in embodied intelligence has brought ego-centric perspectives to contemporary research. One significant challenge within this realm is the accurate localization and tracking of objects in ego-centric videos, primarily due to the substantial variability in viewing angles. 
arXiv:2410.05002 [pdf, other] cs.SI
Social Network Datasets on Reddit Financial Discussion
Authors: Zezhong Wang, Siyang Hao, Inez Maria Zwetsloot, Simon Trimborn
Abstract: Stock markets are impacted by a large variety of factors, including news and discussions among investors about investment opportunities. With the emergence of social media, new opportunities for having financial discussions arose. The market frenzy surrounding GameStop (GME) on the Reddit subreddit Wallstreetbets caused financial discussion forums to receive widespread attention, and it was established that Wallstreetbets played a leading role in the stock market movements of GME. Here, we present a new dataset for exploring the effect of social media discussion forums on the stock market. The dataset consists of posts published on various Reddit subreddits concerning the popular meme stocks GameStop (GME), American Multi-Cinema Entertainment Holdings (AMC), and BlackBerry (BB). We document the data collection and processing steps and show that the posts and comments about these meme stocks are related to their market movements.
Submitted 7 October, 2024; originally announced October 2024.
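One obvious use of such a dataset is relating posting activity to market movements. The snippet below is a minimal sketch of that kind of analysis, assuming hypothetical CSV files and column names (`created`, `date`, `close`), not the dataset's actual schema.

```python
# Minimal sketch: relate daily posting volume about a ticker to its daily
# return. File names and columns are hypothetical placeholders.
import pandas as pd

posts = pd.read_csv("gme_reddit_posts.csv", parse_dates=["created"])
prices = pd.read_csv("gme_prices.csv", parse_dates=["date"])

daily_posts = posts.set_index("created").resample("D").size().rename("n_posts")
returns = prices.set_index("date")["close"].pct_change().rename("ret")

joined = pd.concat([daily_posts, returns], axis=1).dropna()
# Correlate posting volume with the magnitude of the day's move.
print(joined["n_posts"].corr(joined["ret"].abs()))
```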
arXiv:2410.00398 [pdf, other] cs.CV
CusConcept: Customized Visual Concept Decomposition with Diffusion Models
Authors: Zhi Xu, Shaozhe Hao, Kai Han
Abstract: Enabling generative models to decompose visual concepts from a single image is a complex and challenging problem. In this paper, we study a new and challenging task, customized concept decomposition, wherein the objective is to leverage diffusion models to decompose a single image and generate visual concepts from various perspectives. To address this challenge, we propose a two-stage framework, CusConcept (short for Customized Visual Concept Decomposition), to extract customized visual concept embedding vectors that can be embedded into prompts for text-to-image generation. In the first stage, CusConcept employs a vocabulary-guided concept decomposition mechanism to build vocabularies along human-specified conceptual axes. The decomposed concepts are obtained by retrieving corresponding vocabularies and learning anchor weights. In the second stage, joint concept refinement is performed to enhance the fidelity and quality of generated images. We further curate an evaluation benchmark for assessing the performance of the open-world concept decomposition task. Our approach can effectively generate high-quality images of the decomposed concepts and produce related lexical predictions as secondary outcomes. Extensive qualitative and quantitative experiments demonstrate the effectiveness of CusConcept.
Submitted 1 October, 2024; originally announced October 2024.
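The "retrieving corresponding vocabularies and learning anchor weights" step suggests representing each decomposed concept as a learned soft mixture over a per-axis vocabulary. A minimal sketch of that idea follows, assuming frozen token embeddings; shapes and names are illustrative, not the paper's implementation.

```python
# Hedged sketch of vocabulary-guided decomposition: a concept along one
# conceptual axis is a softmax-weighted combination of vocabulary token
# embeddings, with the weights (anchor weights) learned.
import torch
import torch.nn as nn

class AxisConcept(nn.Module):
    def __init__(self, vocab_embeddings: torch.Tensor):  # (V, D), frozen
        super().__init__()
        self.register_buffer("vocab", vocab_embeddings)
        self.logits = nn.Parameter(torch.zeros(vocab_embeddings.shape[0]))

    def forward(self) -> torch.Tensor:
        w = self.logits.softmax(dim=0)   # anchor weights over the vocabulary
        return w @ self.vocab            # (D,) concept embedding

concept = AxisConcept(torch.randn(500, 768))
emb = concept()  # would be injected into a text-to-image prompt embedding
```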
arXiv:2409.20514 [pdf, other] cs.RO
Opt2Skill: Imitating Dynamically-feasible Whole-Body Trajectories for Versatile Humanoid Loco-Manipulation
Authors: Fukang Liu, Zhaoyuan Gu, Yilin Cai, Ziyi Zhou, Shijie Zhao, Hyunyoung Jung, Sehoon Ha, Yue Chen, Danfei Xu, Ye Zhao
Abstract: Humanoid robots are designed to perform diverse loco-manipulation tasks. However, they face challenges due to their high-dimensional and unstable dynamics, as well as the complex contact-rich nature of the tasks. Model-based optimal control methods offer precise and systematic control but are limited by high computational complexity and the need for accurate contact sensing. On the other hand, reinforcement learning (RL) provides robustness and handles high-dimensional spaces but suffers from inefficient learning, unnatural motion, and sim-to-real gaps. To address these challenges, we introduce Opt2Skill, an end-to-end pipeline that combines model-based trajectory optimization with RL to achieve robust whole-body loco-manipulation. We generate reference motions for the Digit humanoid robot using differential dynamic programming (DDP) and train RL policies to track these trajectories. Our results demonstrate that Opt2Skill outperforms pure RL methods in both training efficiency and task performance; optimal trajectories that account for torque limits further enhance trajectory tracking. We successfully transfer our approach to real-world applications.
Submitted 6 December, 2024; v1 submitted 30 September, 2024; originally announced September 2024.
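The tracking objective, rewarding the policy for staying close to the DDP reference in positions, velocities, and torques, might look like the following sketch. The exponential form and the weights are assumptions, not the paper's exact reward.

```python
# Illustrative tracking reward for an RL policy imitating an optimized
# reference trajectory. Terms and weights are assumed for illustration.
import numpy as np

def tracking_reward(q, qd, tau, q_ref, qd_ref, tau_ref,
                    w_q=5.0, w_qd=0.5, w_tau=0.01):
    """High when joint positions, velocities, and torques stay close to
    the reference at the current time step."""
    r_q = np.exp(-w_q * np.sum((q - q_ref) ** 2))       # position tracking
    r_qd = np.exp(-w_qd * np.sum((qd - qd_ref) ** 2))   # velocity tracking
    r_tau = np.exp(-w_tau * np.sum((tau - tau_ref) ** 2))  # torque tracking
    return 0.6 * r_q + 0.2 * r_qd + 0.2 * r_tau
```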
arXiv:2409.14878 [pdf, other] cs.HC
InterMind: A Doctor-Patient-Family Interactive Depression Assessment System Empowered by Large Language Models
Authors: Zhiyuan Zhou, Jilong Liu, Sanwang Wang, Shijie Hao, Yanrong Guo, Richang Hong
Abstract: Depression poses significant challenges to patients and healthcare organizations, necessitating efficient assessment methods. Existing paradigms typically focus on patient-doctor interactions and overlook multi-role participation, such as family involvement in the evaluation and caregiving process. Moreover, current automatic depression detection (ADD) methods usually model depression detection as a classification or regression task, lacking interpretability for the decision-making process. To address these issues, we developed InterMind, a doctor-patient-family interactive depression assessment system empowered by large language models (LLMs). Our system enables patients and families to contribute descriptions, generates assistive diagnostic reports for doctors, and provides actionable insights, improving diagnostic precision and efficiency. To enhance LLMs' performance in psychological counseling and diagnostic interpretability, we integrate retrieval-augmented generation (RAG) and chain-of-thought (CoT) techniques for data augmentation, which mitigates the hallucination issue of LLMs in specific scenarios after instruction fine-tuning. Quantitative experiments and professional assessments by clinicians validate the effectiveness of our system.
Submitted 23 September, 2024; originally announced September 2024.
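A minimal sketch of the RAG step mentioned above: retrieve the most relevant reference passages by cosine similarity and prepend them to a step-by-step prompt. The embedding model, corpus, and prompt wording are placeholders, not InterMind's pipeline.

```python
# Generic RAG skeleton: retrieve supporting passages, then build a prompt
# for the LLM. Embeddings and corpus are placeholders.
import numpy as np

def retrieve(query_vec: np.ndarray, doc_vecs: np.ndarray, docs: list, k=3):
    sims = doc_vecs @ query_vec / (
        np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(query_vec) + 1e-8)
    return [docs[i] for i in np.argsort(-sims)[:k]]  # top-k by cosine sim

def build_prompt(dialogue: str, retrieved: list) -> str:
    context = "\n".join(f"- {d}" for d in retrieved)
    return (f"Reference knowledge:\n{context}\n\n"
            f"Doctor-patient-family dialogue:\n{dialogue}\n\n"
            "Think step by step, then draft an assistive assessment report.")
```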
arXiv:2409.14736 [pdf, other] cs.RO
Learning Koopman Dynamics for Safe Legged Locomotion with Reinforcement Learning-based Controller
Authors: Jeonghwan Kim, Yunhai Han, Harish Ravichandar, Sehoon Ha
Abstract: Learning-based algorithms have demonstrated impressive performance in agile locomotion of legged robots. However, learned policies are often complex and opaque due to the black-box nature of learning algorithms, which hinders predictability and precludes guarantees on performance or safety. In this work, we develop a novel safe navigation framework that combines Koopman operators and model-predictive control (MPC) frameworks. Our method adopts Koopman operator theory to learn the linear evolution of the dynamics of the underlying locomotion policy, which can be effectively learned with Dynamic Mode Decomposition (DMD). Given that our learned model is linear, we can readily leverage the standard MPC algorithm. Our framework is easy to implement with less prior knowledge because it does not require access to the underlying dynamical systems or control-theoretic techniques. We demonstrate that the learned linear dynamics can better predict the trajectories of legged robots than baselines. In addition, we showcase that the proposed navigation framework achieves better safety, with fewer collisions, in challenging and dense environments with narrow passages.
Submitted 23 September, 2024; originally announced September 2024.
Comments: 8 pages
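The DMD step is concrete enough to sketch: collect state snapshots from rollouts of the trained policy and fit a linear operator by least squares. This is a generic DMD fit consistent with the abstract's description, not the paper's code; any state lifting is omitted.

```python
# DMD sketch: fit a linear operator A so that x_{t+1} ≈ A x_t over policy
# rollouts; the linear model can then be rolled out inside an MPC loop.
import numpy as np

def fit_dmd(states: np.ndarray) -> np.ndarray:
    """states: (T, n) array of observed states along one or more rollouts."""
    X, Y = states[:-1].T, states[1:].T   # (n, T-1) snapshot matrices
    return Y @ np.linalg.pinv(X)         # least-squares fit: A = Y X^+

def predict(A: np.ndarray, x0: np.ndarray, horizon: int) -> np.ndarray:
    xs = [x0]
    for _ in range(horizon):
        xs.append(A @ xs[-1])            # linear rollout for the MPC planner
    return np.stack(xs)
```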
arXiv:2409.14296 [pdf, other] cs.AI cs.RO
HM3D-OVON: A Dataset and Benchmark for Open-Vocabulary Object Goal Navigation
Authors: Naoki Yokoyama, Ram Ramrakhya, Abhishek Das, Dhruv Batra, Sehoon Ha
Abstract: We present the Habitat-Matterport 3D Open Vocabulary Object Goal Navigation dataset (HM3D-OVON), a large-scale benchmark that broadens the scope and semantic range of prior Object Goal Navigation (ObjectNav) benchmarks. Leveraging the HM3DSem dataset, HM3D-OVON incorporates over 15k annotated instances of household objects across 379 distinct categories, derived from photo-realistic 3D scans of real-world environments. In contrast to earlier ObjectNav datasets, which limit goal objects to a predefined set of 6-20 categories, HM3D-OVON facilitates the training and evaluation of models with an open set of goals defined through free-form language at test-time. Through this open-vocabulary formulation, HM3D-OVON encourages progress towards learning visuo-semantic navigation behaviors that are capable of searching for any object specified by text in an open-vocabulary manner. Additionally, we systematically evaluate and compare several different types of approaches on HM3D-OVON. We find that HM3D-OVON can be used to train an open-vocabulary ObjectNav agent that achieves higher performance and is more robust to localization and actuation noise than the state-of-the-art ObjectNav approach. We hope that our benchmark and baseline results will drive interest in developing embodied agents that can navigate real-world spaces to find household objects specified through free-form language, taking a step towards more flexible and human-like semantic visual navigation. Code and videos available at: naoki.io/ovon.
Submitted 21 September, 2024; originally announced September 2024.
arXiv:2409.11532 [pdf, other] cs.RO
Enhancing the Reliability of LiDAR Point Cloud Sampling: A Colorization and Super-Resolution Approach Based on LiDAR-Generated Images
Authors: Sier Ha, Honghao Du, Xianjia Yu, Jian Song, Tomi Westerlund
Abstract: In recent years, Light Detection and Ranging (LiDAR) technology, a critical sensor in robotics and autonomous systems, has seen significant advancements. These improvements include enhanced resolution of point clouds and the capability to provide 360° low-resolution images. These images encode various data such as depth, reflectivity, and near-infrared light within the pixels. However, an excessive density of points and conventional point cloud sampling can be counterproductive, particularly in applications such as LiDAR odometry, where misleading points and degraded geometry information may induce drift errors. Currently, extensive research efforts are being directed towards leveraging LiDAR-generated images to improve situational awareness. This paper presents a comprehensive review of current deep learning (DL) techniques, including colorization and super-resolution, which are traditionally utilized in conventional computer vision tasks. These techniques are applied to LiDAR-generated images and are analyzed qualitatively. Based on this analysis, we have developed a novel approach that selectively integrates the most suitable colorization and super-resolution methods with LiDAR imagery to sample reliable points from the LiDAR point cloud. This approach aims not only to improve the accuracy of point cloud registration but also to avoid mismatching caused by lacking geometry information, thereby augmenting the utility and precision of LiDAR systems in practical applications. In our evaluation, the proposed approach demonstrates superior performance compared to our previous work, achieving lower translation and rotation errors with a reduced number of points.
Submitted 17 September, 2024; originally announced September 2024.
Comments: 9 pages
arXiv:2409.09473 [pdf, other] cs.RO cs.LG
Learning to enhance multi-legged robot on rugged landscapes
Authors: Juntao He, Baxi Chong, Zhaochen Xu, Sehoon Ha, Daniel I. Goldman
Abstract: Navigating rugged landscapes poses significant challenges for legged locomotion. Multi-legged robots (those with six or more legs) offer a promising solution for such terrains, largely due to their inherent high static stability, resulting from a low center of mass and wide base of support. Such systems require minimal effort to maintain balance. Recent studies have shown that a linear controller, which modulates the vertical body undulation of a multi-legged robot in response to shifts in terrain roughness, can ensure reliable mobility on challenging terrains. However, the potential of a learning-based control framework that adjusts multiple parameters to address terrain heterogeneity remains underexplored. We posit that the development of an experimentally validated physics-based simulator for this robot can rapidly advance capabilities by allowing wide parameter space exploration. Here we develop a MuJoCo-based simulator tailored to this robotic platform and use the simulation to develop a reinforcement learning-based control framework that dynamically adjusts horizontal and vertical body undulation and limb stepping in real time. Our approach improves robot performance in simulation, laboratory experiments, and outdoor tests. Notably, our real-world experiments reveal that the learning-based controller achieves a 30% to 50% increase in speed compared to a linear controller that only modulates vertical body waves. We hypothesize that the superior performance of the learning-based controller arises from its ability to adjust multiple parameters simultaneously, including limb stepping, horizontal body wave, and vertical body wave.
Submitted 14 September, 2024; originally announced September 2024.
Comments: Submitted to ICRA 2025
arXiv:2409.08020 [pdf] cs.LG
Network Anomaly Traffic Detection via Multi-view Feature Fusion
Authors: Song Hao, Wentao Fu, Xuanze Chen, Chengxiang Jin, Jiajun Zhou, Shanqing Yu, Qi Xuan
Abstract: Traditional anomalous traffic detection methods are based on single-view analysis, which has obvious limitations in dealing with complex attacks and encrypted communications. In this regard, we propose a Multi-view Feature Fusion (MuFF) method for network anomaly traffic detection. MuFF models the relationships of packets in network traffic from temporal and interactive viewpoints, learning temporal and interactive features respectively. These features are then fused across the different perspectives for anomaly traffic detection. Extensive experiments on six real traffic datasets show that MuFF performs excellently in network anomalous traffic detection, making up for the shortcomings of detection under a single perspective.
Submitted 12 September, 2024; originally announced September 2024.
Comments: in Chinese; accepted by the Journal of Command and Control
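The fusion itself can be as simple as concatenating the two view embeddings before a classifier head. The sketch below illustrates that late-fusion pattern; the encoders, dimensions, and head are assumptions rather than MuFF's actual architecture.

```python
# Hedged sketch of multi-view late fusion: a temporal-view embedding and an
# interaction-view embedding of a traffic flow are concatenated and classified.
import torch
import torch.nn as nn

class FusionHead(nn.Module):
    def __init__(self, d_temporal=64, d_interact=64, n_classes=2):
        super().__init__()
        self.fuse = nn.Sequential(
            nn.Linear(d_temporal + d_interact, 64), nn.ReLU(),
            nn.Linear(64, n_classes))

    def forward(self, f_temporal, f_interact):
        # Concatenate per-view features, then classify normal vs. anomalous.
        return self.fuse(torch.cat([f_temporal, f_interact], dim=-1))

head = FusionHead()
logits = head(torch.randn(8, 64), torch.randn(8, 64))  # batch of 8 flows
```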
arXiv:2409.05260 [pdf, other] cs.CV
Scalable Frame Sampling for Video Classification: A Semi-Optimal Policy Approach with Reduced Search Space
Authors: Junho Lee, Jeongwoo Shin, Seung Woo Ko, Seongsu Ha, Joonseok Lee
Abstract: Given a video with $T$ frames, frame sampling is the task of selecting $N \ll T$ frames so as to maximize the performance of a fixed video classifier. Most existing methods, not just brute-force search, suffer from the vast search space of $\binom{T}{N}$, especially when $N$ gets large. To address this challenge, we introduce a novel perspective of reducing the search space from $O(T^N)$ to $O(T)$. Instead of exploring the entire $O(T^N)$ space, our proposed semi-optimal policy selects the top $N$ frames based on the independently estimated value of each frame, using per-frame confidence, significantly reducing the computational complexity. We verify that our semi-optimal policy can efficiently approximate the optimal policy, particularly under practical settings. Additionally, through extensive experiments on various datasets and model architectures, we demonstrate that learning our semi-optimal policy ensures stable and high performance regardless of the size of $N$ and $T$.
Submitted 8 September, 2024; originally announced September 2024.
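The semi-optimal policy is easy to state in code: score all $T$ frames independently and keep the top $N$, so only $O(T)$ evaluations are needed instead of searching over subsets. In this sketch, `frame_confidence` stands in for the learned per-frame value estimator.

```python
# Top-N frame selection by independent per-frame scores: O(T) scoring
# instead of searching the O(T^N) space of frame subsets.
import numpy as np

def semi_optimal_sample(frames: np.ndarray, n: int, frame_confidence):
    """frames: (T, H, W, 3); frame_confidence maps one frame to a scalar."""
    scores = np.array([frame_confidence(f) for f in frames])  # T scores
    keep = np.sort(np.argsort(-scores)[:n])  # top-N, restored to time order
    return frames[keep]

# The fixed video classifier then runs on only the N selected frames.
```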
arXiv:2409.03745 [pdf, other] cs.CV
ArtiFade: Learning to Generate High-quality Subject from Blemished Images
Authors: Shuya Yang, Shaozhe Hao, Yukang Cao, Kwan-Yee K. Wong
Abstract: Subject-driven text-to-image generation has witnessed remarkable advancements in its ability to learn and capture characteristics of a subject using only a limited number of images. However, existing methods commonly rely on high-quality images for training and may struggle to generate reasonable images when the input images are blemished by artifacts. This is primarily attributed to the inadequate capability of current techniques in distinguishing subject-related features from disruptive artifacts. In this paper, we introduce ArtiFade to tackle this issue and successfully generate high-quality artifact-free images from blemished datasets. Specifically, ArtiFade exploits fine-tuning of a pre-trained text-to-image model, aiming to remove artifacts. The elimination of artifacts is achieved by utilizing a specialized dataset that encompasses both unblemished images and their corresponding blemished counterparts during fine-tuning. ArtiFade also ensures the preservation of the original generative capabilities inherent within the diffusion model, thereby enhancing the overall performance of subject-driven methods in generating high-quality and artifact-free images. We further devise evaluation benchmarks tailored for this task. Through extensive qualitative and quantitative experiments, we demonstrate the generalizability of ArtiFade in effective artifact removal under both in-distribution and out-of-distribution scenarios.
Submitted 5 September, 2024; originally announced September 2024.
arXiv:2408.11318 [pdf, ps, other] cs.CV
TWLV-I: Analysis and Insights from Holistic Evaluation on Video Foundation Models
Authors: Hyeongmin Lee, Jin-Young Kim, Kyungjune Baek, Jihwan Kim, Hyojun Go, Seongsu Ha, Seokjin Han, Jiho Jang, Raehyuk Jung, Daewoo Kim, GeunOh Kim, JongMok Kim, Jongseok Kim, Junwan Kim, Soonwoo Kwon, Jangwon Lee, Seungjoon Park, Minjoon Seo, Jay Suh, Jaehyuk Yi, Aiden Lee
Abstract: In this work, we discuss evaluating video foundation models in a fair and robust manner. Unlike language or image foundation models, many video foundation models are evaluated with differing parameters (such as sampling rate, number of frames, pretraining steps, etc.), making fair and robust comparisons challenging. Therefore, we present a carefully designed evaluation framework for measuring two core capabilities of video comprehension: appearance and motion understanding. Our findings reveal that existing video foundation models, whether text-supervised like UMT or InternVideo2, or self-supervised like V-JEPA, exhibit limitations in at least one of these capabilities. As an alternative, we introduce TWLV-I, a new video foundation model that constructs robust visual representations for both motion- and appearance-based videos. Based on the average top-1 accuracy of linear probing on five action recognition benchmarks, pretrained only on publicly accessible datasets, our model shows a 4.6%p improvement compared to V-JEPA (ViT-L) and a 7.7%p improvement compared to UMT (ViT-L). Even when compared to much larger models, our model demonstrates a 7.2%p improvement compared to DFN (ViT-H), a 2.7%p improvement compared to V-JEPA (ViT-H), and a 2.8%p improvement compared to InternVideo2 (ViT-g). We provide embedding vectors obtained by TWLV-I from videos of several commonly used video benchmarks, along with evaluation source code that can directly utilize these embeddings. The code is available at https://github.com/twelvelabs-io/video-embeddings-evaluation-framework.
Submitted 22 August, 2024; v1 submitted 20 August, 2024; originally announced August 2024.
Comments: 17 pages; Twelve Labs Technical Report
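Linear probing, the protocol behind the accuracy numbers above, freezes the encoder and fits only a linear classifier on its embeddings. A self-contained sketch with random placeholder arrays standing in for the frozen-encoder embeddings and labels:

```python
# Linear probing sketch: train a linear classifier on frozen embeddings
# and report top-1 accuracy. Random arrays are placeholders for real data.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
train_emb = rng.normal(size=(1000, 512))     # embeddings from frozen encoder
train_y = rng.integers(0, 10, 1000)          # action-class labels
test_emb = rng.normal(size=(200, 512))
test_y = rng.integers(0, 10, 200)

probe = LogisticRegression(max_iter=1000).fit(train_emb, train_y)
print("top-1 accuracy:", probe.score(test_emb, test_y))
```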
arXiv:2408.10934 [pdf, other] cs.CV cs.AI eess.IV
SDI-Net: Toward Sufficient Dual-View Interaction for Low-light Stereo Image Enhancement
Authors: Linlin Hu, Ao Sun, Shijie Hao, Richang Hong, Meng Wang
Abstract: Currently, most low-light image enhancement methods only consider information from a single view, neglecting the correlation between cross-view information. Therefore, the enhancement results produced by these methods are often unsatisfactory. In this context, there have been efforts to develop methods specifically for low-light stereo image enhancement. These methods take into account the cross-view disparities and enable interaction between the left and right views, leading to improved performance. However, these methods still do not fully exploit the interaction between left and right view information. To address this issue, we propose a model called Toward Sufficient Dual-View Interaction for Low-light Stereo Image Enhancement (SDI-Net). The backbone structure of SDI-Net is two encoder-decoder pairs, which are used to learn the mapping function from low-light images to normal-light images. Within the encoders and decoders, we design a module named Cross-View Sufficient Interaction Module (CSIM), aiming to fully exploit the correlations between the binocular views via the attention mechanism. The quantitative and visual results on public datasets validate the superiority of our method over other related methods. Ablation studies also demonstrate the effectiveness of the key elements in our model.
Submitted 20 August, 2024; originally announced August 2024.
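Cross-view interaction via attention, the mechanism CSIM builds on, can be sketched as one view attending to the other. This illustrates the general pattern, not the CSIM module itself; the dimensions are arbitrary.

```python
# Hedged sketch of cross-view attention: features of view A attend to
# features of view B (swap the arguments for the opposite direction).
import torch
import torch.nn as nn

class CrossViewAttention(nn.Module):
    def __init__(self, dim=64, heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, feat_a, feat_b):
        # feat_*: (B, N, dim) flattened spatial features of one view
        out, _ = self.attn(query=feat_a, key=feat_b, value=feat_b)
        return feat_a + out  # residual: view A enriched with view B

attn = CrossViewAttention()
left, right = torch.randn(2, 1024, 64), torch.randn(2, 1024, 64)
left_enhanced = attn(left, right)
right_enhanced = attn(right, left)
```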
arXiv:2408.07009 [pdf, other] cs.CV
Imagen 3
Authors: Imagen-Team-Google: Jason Baldridge, Jakob Bauer, Mukul Bhutani, Nicole Brichtova, Andrew Bunner, Lluis Castrejon, Kelvin Chan, Yichang Chen, Sander Dieleman, Yuqing Du, Zach Eaton-Rosen, Hongliang Fei, Nando de Freitas, Yilin Gao, Evgeny Gladchenko, Sergio Gómez Colmenarejo, Mandy Guo, Alex Haig, Will Hawkins, Hexiang Hu, Huilian Huang, Tobenna Peter Igwe, Christos Kaplanis, et al. (237 additional authors not shown)
Abstract: We introduce Imagen 3, a latent diffusion model that generates high quality images from text prompts. We describe our quality and responsibility evaluations. Imagen 3 is preferred over other state-of-the-art (SOTA) models at the time of evaluation. In addition, we discuss issues around safety and representation, as well as methods we used to minimize the potential harm of our models.
Submitted 21 December, 2024; v1 submitted 13 August, 2024; originally announced August 2024.
arXiv:2408.06811 [pdf] cs.CV
Oracle Bone Script Similiar Character Screening Approach Based on Simsiam Contrastive Learning and Supervised Learning
Authors: Xinying Weng, Yifan Li, Shuaidong Hao, Jialiang Hou
Abstract: This project proposes a new method that uses the fuzzy comprehensive evaluation method to integrate ResNet-50 self-supervised and RepVGG supervised learning. The source image dataset HWOBC oracle is taken as input, the target image is selected, and finally the most similar images are output in turn without any manual intervention. Different feature encoding methods are used for images of different modalities. Before model training, the image data are preprocessed: images are enhanced by random rotation, a self-square graph equalization algorithm, and gamma transform, which effectively strengthens the learning of key features. Finally, the fuzzy comprehensive evaluation method is used to combine the results of supervised and unsupervised training, which can better address the "most similar" problem, which is difficult to quantify. At present, many unknown oracle-bone inscriptions are waiting to be deciphered; linking them to similar known glyphs can provide new ideas for decipherment.
Submitted 13 August, 2024; originally announced August 2024.
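The final fusion step, combining self-supervised and supervised similarity scores into one ranking, can be approximated by normalizing each score vector and taking a weighted sum. The equal weights and min-max normalization here are assumptions; the paper's fuzzy membership functions may differ.

```python
# Sketch of fusing two similarity score vectors over candidate glyphs into
# a single ranking. Weights and normalization are illustrative assumptions.
import numpy as np

def fuse_scores(sim_selfsup, sim_sup, w=(0.5, 0.5)):
    def norm(x):  # map each score vector onto [0, 1]
        return (x - x.min()) / (x.max() - x.min() + 1e-8)
    return w[0] * norm(sim_selfsup) + w[1] * norm(sim_sup)

sim_a = np.array([0.2, 0.9, 0.4])   # ResNet-50 (SimSiam branch) similarities
sim_b = np.array([0.3, 0.7, 0.8])   # RepVGG (supervised branch) similarities
ranked = np.argsort(-fuse_scores(sim_a, sim_b))  # most similar glyph first
```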
arXiv:2407.07077 [pdf, other] cs.CV cs.AI
ConceptExpress: Harnessing Diffusion Models for Single-image Unsupervised Concept Extraction
Authors: Shaozhe Hao, Kai Han, Zhengyao Lv, Shihao Zhao, Kwan-Yee K. Wong
Abstract: While personalized text-to-image generation has enabled the learning of a single concept from multiple images, a more practical yet challenging scenario involves learning multiple concepts within a single image.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages; Twelve Labs Technical Report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10934">arXiv:2408.10934</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10934">pdf</a>, <a href="https://arxiv.org/format/2408.10934">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> SDI-Net: Toward Sufficient Dual-View Interaction for Low-light Stereo Image Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+L">Linlin Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+A">Ao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+S">Shijie Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+R">Richang Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10934v1-abstract-short" style="display: inline;"> Currently, most low-light image enhancement methods only consider information from a single view, neglecting the correlation between cross-view information. Therefore, the enhancement results produced by these methods are often unsatisfactory. In this context, there have been efforts to develop methods specifically for low-light stereo image enhancement. These methods take into account the cross-v&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10934v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10934v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10934v1-abstract-full" style="display: none;"> Currently, most low-light image enhancement methods only consider information from a single view, neglecting the correlation between cross-view information. Therefore, the enhancement results produced by these methods are often unsatisfactory. In this context, there have been efforts to develop methods specifically for low-light stereo image enhancement. These methods take into account the cross-view disparities and enable interaction between the left and right views, leading to improved performance. However, these methods still do not fully exploit the interaction between left and right view information. To address this issue, we propose a model called Toward Sufficient Dual-View Interaction for Low-light Stereo Image Enhancement (SDI-Net). The backbone structure of SDI-Net is two encoder-decoder pairs, which are used to learn the mapping function from low-light images to normal-light images. 
Among the encoders and the decoders, we design a module named Cross-View Sufficient Interaction Module (CSIM), aiming to fully exploit the correlations between the binocular views via the attention mechanism. The quantitative and visual results on public datasets validate the superiority of our method over other related methods. Ablation studies also demonstrate the effectiveness of the key elements in our model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10934v1-abstract-full').style.display = 'none'; document.getElementById('2408.10934v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07009">arXiv:2408.07009</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.07009">pdf</a>, <a href="https://arxiv.org/format/2408.07009">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Imagen 3 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Imagen-Team-Google"> Imagen-Team-Google</a>, <a href="/search/cs?searchtype=author&amp;query=%3A"> :</a>, <a href="/search/cs?searchtype=author&amp;query=Baldridge%2C+J">Jason Baldridge</a>, <a href="/search/cs?searchtype=author&amp;query=Bauer%2C+J">Jakob Bauer</a>, <a href="/search/cs?searchtype=author&amp;query=Bhutani%2C+M">Mukul Bhutani</a>, <a href="/search/cs?searchtype=author&amp;query=Brichtova%2C+N">Nicole Brichtova</a>, <a href="/search/cs?searchtype=author&amp;query=Bunner%2C+A">Andrew Bunner</a>, <a href="/search/cs?searchtype=author&amp;query=Castrejon%2C+L">Lluis Castrejon</a>, <a href="/search/cs?searchtype=author&amp;query=Chan%2C+K">Kelvin Chan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yichang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dieleman%2C+S">Sander Dieleman</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Y">Yuqing Du</a>, <a href="/search/cs?searchtype=author&amp;query=Eaton-Rosen%2C+Z">Zach Eaton-Rosen</a>, <a href="/search/cs?searchtype=author&amp;query=Fei%2C+H">Hongliang Fei</a>, <a href="/search/cs?searchtype=author&amp;query=de+Freitas%2C+N">Nando de Freitas</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yilin Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Gladchenko%2C+E">Evgeny Gladchenko</a>, <a href="/search/cs?searchtype=author&amp;query=Colmenarejo%2C+S+G">Sergio G贸mez Colmenarejo</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+M">Mandy Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Haig%2C+A">Alex Haig</a>, <a href="/search/cs?searchtype=author&amp;query=Hawkins%2C+W">Will Hawkins</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Hexiang Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Huilian Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Igwe%2C+T+P">Tobenna Peter Igwe</a>, <a href="/search/cs?searchtype=author&amp;query=Kaplanis%2C+C">Christos Kaplanis</a> , et al. 
arXiv:2408.07009 [pdf, other] (cs.CV)
Imagen 3
Authors: Imagen-Team-Google: Jason Baldridge, Jakob Bauer, Mukul Bhutani, Nicole Brichtova, Andrew Bunner, Lluis Castrejon, Kelvin Chan, Yichang Chen, Sander Dieleman, Yuqing Du, Zach Eaton-Rosen, Hongliang Fei, Nando de Freitas, Yilin Gao, Evgeny Gladchenko, Sergio Gómez Colmenarejo, Mandy Guo, Alex Haig, Will Hawkins, Hexiang Hu, Huilian Huang, Tobenna Peter Igwe, Christos Kaplanis, et al. (237 additional authors not shown)
Abstract: We introduce Imagen 3, a latent diffusion model that generates high-quality images from text prompts. We describe our quality and responsibility evaluations. Imagen 3 is preferred over other state-of-the-art (SOTA) models at the time of evaluation. In addition, we discuss issues around safety and representation, as well as methods we used to minimize the potential harm of our models.
Submitted 21 December, 2024; v1 submitted 13 August, 2024; originally announced August 2024.

arXiv:2408.06811 [pdf] (cs.CV)
Oracle Bone Script Similiar Character Screening Approach Based on Simsiam Contrastive Learning and Supervised Learning
Authors: Xinying Weng, Yifan Li, Shuaidong Hao, Jialiang Hou
Abstract: This project proposes a new method that uses the fuzzy comprehensive evaluation method to integrate ResNet-50 self-supervised learning and RepVGG supervised learning. The HWOBC oracle-bone source image dataset is taken as input, a target image is selected, and the most similar images are then output in ranked order without any manual intervention. Images of different modalities do not share the same feature-encoding method. Before model training, the image data are preprocessed and augmented by random rotation, a histogram-equalization algorithm, and gamma transformation, which effectively strengthens the learning of key features. Finally, the fuzzy comprehensive evaluation method combines the results of supervised and unsupervised training, which better addresses the hard-to-quantify "most similar" problem. Many unknown oracle-bone inscriptions still await decipherment, and comparing them with known glyphs can provide new ideas for cracking them.
Submitted 13 August, 2024; originally announced August 2024.
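For a rough picture of the fusion step, a weighted fuzzy-style combination of the two branches' similarity rankings might look like the following; the glyph names, scores, and equal weights are all made up:

```python
def fuse_similarity(sup_scores, selfsup_scores, w_sup=0.5, w_self=0.5):
    """Combine supervised (RepVGG) and self-supervised (SimSiam) similarity
    scores into one membership value per candidate glyph, then rank."""
    def normalize(scores):
        lo, hi = min(scores.values()), max(scores.values())
        span = (hi - lo) or 1.0
        return {k: (v - lo) / span for k, v in scores.items()}
    sup, slf = normalize(sup_scores), normalize(selfsup_scores)
    fused = {k: w_sup * sup[k] + w_self * slf[k] for k in sup}
    return sorted(fused, key=fused.get, reverse=True)  # most similar first

sup = {"glyph_a": 0.91, "glyph_b": 0.40, "glyph_c": 0.77}  # hypothetical scores
slf = {"glyph_a": 0.60, "glyph_b": 0.85, "glyph_c": 0.70}
print(fuse_similarity(sup, slf))
```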
arXiv:2407.07077 [pdf, other] (cs.CV, cs.AI)
ConceptExpress: Harnessing Diffusion Models for Single-image Unsupervised Concept Extraction
Authors: Shaozhe Hao, Kai Han, Zhengyao Lv, Shihao Zhao, Kwan-Yee K. Wong
Abstract: While personalized text-to-image generation has enabled the learning of a single concept from multiple images, a more practical yet challenging scenario involves learning multiple concepts within a single image. However, existing works tackling this scenario rely heavily on extensive human annotations. In this paper, we introduce a novel task named Unsupervised Concept Extraction (UCE) that considers an unsupervised setting without any human knowledge of the concepts. Given an image that contains multiple concepts, the task aims to extract and recreate individual concepts relying solely on the existing knowledge of pretrained diffusion models. To achieve this, we present ConceptExpress, which tackles UCE by unleashing the inherent capabilities of pretrained diffusion models in two aspects. Specifically, a concept localization approach automatically locates and disentangles salient concepts by leveraging spatial correspondence from diffusion self-attention; and, based on the lookup association between a concept and a conceptual token, a concept-wise optimization process learns discriminative tokens that represent each individual concept. Finally, we establish an evaluation protocol tailored for the UCE task. Extensive experiments demonstrate that ConceptExpress is a promising solution to the UCE task. Our code and data are available at: https://github.com/haoosz/ConceptExpress
Submitted 9 July, 2024; originally announced July 2024.
Comments: ECCV 2024, Project page: https://haoosz.github.io/ConceptExpress/
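The concept-localization step can be pictured as clustering per-pixel diffusion self-attention features into concept regions. A toy sketch under that reading (random arrays stand in for real activations, and the number of concepts k is fixed by hand here, unlike the automatic localization in the paper):

```python
import numpy as np
from sklearn.cluster import KMeans

def concept_masks(features: np.ndarray, k: int = 3) -> np.ndarray:
    """features: (H, W, C) per-pixel self-attention features.
    Returns an (H, W) map assigning each pixel to one of k concepts."""
    h, w, c = features.shape
    labels = KMeans(n_clusters=k, n_init=10).fit_predict(features.reshape(-1, c))
    return labels.reshape(h, w)

feats = np.random.rand(16, 16, 64).astype(np.float32)  # stand-in activations
masks = concept_masks(feats)
print(masks.shape, np.unique(masks))
```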
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024, Project page: https://haoosz.github.io/ConceptExpress/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06780">arXiv:2407.06780</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.06780">pdf</a>, <a href="https://arxiv.org/format/2407.06780">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CoLA: Conditional Dropout and Language-driven Robust Dual-modal Salient Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hao%2C+S">Shuang Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+C">Chunlin Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+H">He Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06780v1-abstract-short" style="display: inline;"> The depth/thermal information is beneficial for detecting salient object with conventional RGB images. However, in dual-modal salient object detection (SOD) model, the robustness against noisy inputs and modality missing is crucial but rarely studied. To tackle this problem, we introduce \textbf{Co}nditional Dropout and \textbf{LA}nguage-driven(\textbf{CoLA}) framework comprising two core componen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06780v1-abstract-full').style.display = 'inline'; document.getElementById('2407.06780v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06780v1-abstract-full" style="display: none;"> The depth/thermal information is beneficial for detecting salient object with conventional RGB images. However, in dual-modal salient object detection (SOD) model, the robustness against noisy inputs and modality missing is crucial but rarely studied. To tackle this problem, we introduce \textbf{Co}nditional Dropout and \textbf{LA}nguage-driven(\textbf{CoLA}) framework comprising two core components. 1) Language-driven Quality Assessment (LQA): Leveraging a pretrained vision-language model with a prompt learner, the LQA recalibrates image contributions without requiring additional quality annotations. This approach effectively mitigates the impact of noisy inputs. 2) Conditional Dropout (CD): A learning method to strengthen the model&#39;s adaptability in scenarios with missing modalities, while preserving its performance under complete modalities. The CD serves as a plug-in training scheme that treats modality-missing as conditions, strengthening the overall robustness of various dual-modal SOD models. Extensive experiments demonstrate that the proposed method outperforms state-of-the-art dual-modal SOD models, under both modality-complete and modality-missing conditions. We will release source code upon acceptance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06780v1-abstract-full').style.display = 'none'; document.getElementById('2407.06780v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04213">arXiv:2407.04213</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04213">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Pathfinder: Exploring Path Diversity for Assessing Internet Censorship Inconsistency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaoqin Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+G">Guannan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+L">Lin Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+S">Shuai Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haining Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04213v1-abstract-short" style="display: inline;"> Internet censorship is typically enforced by authorities to achieve information control for a certain group of Internet users. So far existing censorship studies have primarily focused on country-level characterization because (1) in many cases, censorship is enabled by governments with nationwide policies and (2) it is usually hard to control how the probing packets are routed to trigger censorsh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04213v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04213v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04213v1-abstract-full" style="display: none;"> Internet censorship is typically enforced by authorities to achieve information control for a certain group of Internet users. So far existing censorship studies have primarily focused on country-level characterization because (1) in many cases, censorship is enabled by governments with nationwide policies and (2) it is usually hard to control how the probing packets are routed to trigger censorship in different networks inside a country. However, the deployment and implementation of censorship could be highly diverse at the ISP level. In this paper, we investigate Internet censorship from a different perspective by scrutinizing the diverse censorship deployment inside a country. Specifically, by leveraging an end-to-end measurement framework, we deploy multiple geo-distributed back-end control servers to explore various paths from one single vantage point. 
arXiv:2407.04213 [pdf] (cs.CR, cs.NI)
Pathfinder: Exploring Path Diversity for Assessing Internet Censorship Inconsistency
Authors: Xiaoqin Liang, Guannan Liu, Lin Jin, Shuai Hao, Haining Wang
Abstract: Internet censorship is typically enforced by authorities to achieve information control for a certain group of Internet users. So far, existing censorship studies have primarily focused on country-level characterization because (1) in many cases, censorship is enabled by governments with nationwide policies, and (2) it is usually hard to control how probing packets are routed to trigger censorship in different networks inside a country. However, the deployment and implementation of censorship can be highly diverse at the ISP level. In this paper, we investigate Internet censorship from a different perspective by scrutinizing the diverse censorship deployments inside a country. Specifically, leveraging an end-to-end measurement framework, we deploy multiple geo-distributed back-end control servers to explore various paths from one single vantage point. Traffic carrying the same domain but destined for different control servers' IPs can be forced to traverse different transit networks, and is thereby examined by different censorship devices if present. Through large-scale experiments and in-depth investigation, we reveal that diversity in Internet censorship caused by different routing paths inside a country is prevalent, implying that (1) implementations of centralized censorship are commonly incomplete or flawed and (2) decentralized censorship is also common. Moreover, we identify that different hosting platforms also result in inconsistent censorship activities due to different peering relationships with the ISPs in a country. Finally, we present extensive case studies illustrating the configurations that lead to censorship inconsistency and exploring their causes.
Submitted 4 July, 2024; originally announced July 2024.
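The measurement primitive amounts to sending the same request toward different back-end control servers and diffing the answers. A simplified sketch (the IPs are TEST-NET placeholders, the domain is an example, and the real framework controls both endpoints):

```python
import hashlib
import requests

CONTROL_SERVERS = ["203.0.113.10", "203.0.113.20"]  # placeholder back-end IPs
DOMAIN = "example.com"                              # placeholder test domain

def probe(ip, domain):
    """Fetch the same domain via one specific server IP."""
    try:
        r = requests.get(f"http://{ip}/", headers={"Host": domain}, timeout=5)
        return r.status_code, hashlib.sha256(r.content).hexdigest()[:12]
    except requests.RequestException as exc:
        return "error", type(exc).__name__

results = {ip: probe(ip, DOMAIN) for ip in CONTROL_SERVERS}
print(results)  # differing answers across paths suggest on-path interference
```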
arXiv:2406.15999 [pdf, other] (cs.SE) doi: 10.1145/3643738
SmartAxe: Detecting Cross-Chain Vulnerabilities in Bridge Smart Contracts via Fine-Grained Static Analysis
Authors: Zeqin Liao, Yuhong Nan, Henglong Liang, Sicheng Hao, Juan Zhai, Jiajing Wu, Zibin Zheng
Abstract: With the increasing popularity of blockchain, different blockchain platforms coexist in the ecosystem (e.g., Ethereum, BNB, EOSIO, etc.), which prompts high demand for cross-chain communication. A cross-chain bridge is a specific type of decentralized application for asset exchange across different blockchain platforms. Securing the smart contracts of cross-chain bridges is an urgent need, as a number of recent security incidents with heavy financial losses were caused by vulnerabilities in bridge smart contracts, which we call Cross-Chain Vulnerabilities (CCVs). However, automatically identifying CCVs in smart contracts poses several unique challenges. In particular, it is non-trivial to (1) identify application-specific access control constraints needed for cross-bridge asset exchange, and (2) identify inconsistent cross-chain semantics between the two sides of the bridge. In this paper, we propose SmartAxe, a new framework to identify vulnerabilities in cross-chain bridge smart contracts. In particular, to locate vulnerable functions with incomplete access control, SmartAxe models the heterogeneous implementations of access control and finds necessary security checks in smart contracts through probabilistic pattern inference. Besides, SmartAxe constructs a cross-chain control-flow graph (xCFG) and data-flow graph (xDFG), which help to find semantic inconsistency during cross-chain data communication. To evaluate SmartAxe, we collect and label a dataset of 88 CCVs from real-world attacks on cross-chain bridge contracts. Evaluation results show that SmartAxe achieves a precision of 84.95% and a recall of 89.77%. In addition, SmartAxe successfully identifies 232 new/unknown CCVs from 129 real-world cross-chain bridge applications (i.e., from 1,703 smart contracts). These identified CCVs affect digital assets worth a total of 1,885,250 USD.
Submitted 22 June, 2024; originally announced June 2024.
Journal ref: The ACM International Conference on the Foundations of Software Engineering 2024
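One way to picture the probabilistic pattern inference over access control: a guard that appears in most functions of the same role is probably required, so its absence elsewhere is suspicious. A toy sketch with hypothetical check names and a hand-picked threshold:

```python
from collections import Counter

# hypothetical deposit handlers from several bridge contracts
functions = {
    "deposit_v1": {"onlyRelayer", "checkChainId"},
    "deposit_v2": {"onlyRelayer", "checkChainId", "nonReentrant"},
    "deposit_v3": {"checkChainId"},
}

counts = Counter(c for checks in functions.values() for c in checks)
required = {c for c, k in counts.items() if k / len(functions) >= 0.5}

for name, checks in functions.items():
    missing = required - checks
    if missing:
        print(f"{name}: possibly missing access-control check(s) {missing}")
# deposit_v3: possibly missing access-control check(s) {'onlyRelayer'}
```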
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> The ACM International Conference on the Foundations of Software Engineering 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.15988">arXiv:2406.15988</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.15988">pdf</a>, <a href="https://arxiv.org/format/2406.15988">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3597926.3598111">10.1145/3597926.3598111 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SmartState: Detecting State-Reverting Vulnerabilities in Smart Contracts via Fine-Grained State-Dependency Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liao%2C+Z">Zeqin Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+S">Sicheng Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Nan%2C+Y">Yuhong Nan</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Z">Zibin Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.15988v1-abstract-short" style="display: inline;"> Smart contracts written in Solidity are widely used in different blockchain platforms such as Ethereum, TRON and BNB Chain. One of the unique designs in Solidity smart contracts is its state-reverting mechanism for error handling and access control. Unfortunately, a number of recent security incidents showed that adversaries also utilize this mechanism to manipulate critical states of smart contra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15988v1-abstract-full').style.display = 'inline'; document.getElementById('2406.15988v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.15988v1-abstract-full" style="display: none;"> Smart contracts written in Solidity are widely used in different blockchain platforms such as Ethereum, TRON and BNB Chain. One of the unique designs in Solidity smart contracts is its state-reverting mechanism for error handling and access control. Unfortunately, a number of recent security incidents showed that adversaries also utilize this mechanism to manipulate critical states of smart contracts, and hence, bring security consequences such as illegal profit-gain and Deny-of-Service (DoS). In this paper, we call such vulnerabilities as the State-reverting Vulnerability (SRV). Automatically identifying SRVs poses unique challenges, as it requires an in-depth analysis and understanding of the state-dependency relations in smart contracts. This paper presents SmartState, a new framework for detecting state-reverting vulnerability in Solidity smart contracts via fine-grained state-dependency analysis. SmartState integrates a set of novel mechanisms to ensure its effectiveness. 
Particularly, Smart-State extracts state dependencies from both contract bytecode and historical transactions. Both of them are critical for inferring dependencies related to SRVs. Further, SmartState models the generic patterns of SRVs (i.e., profit-gain and DoS) as SRV indicators, and hence effectively identify SRVs based on the constructed state-dependency graph. To evaluate SmartState, we manually annotated a ground-truth dataset which contains 91 SRVs in the real world. Evaluation results showed that SmartState achieves a precision of 87.23% and a recall of 89.13%. In addition, SmartState successfully identifies 406 new SRVs from 47,351 real-world smart contracts. 11 of these SRVs are from popular smart contracts with high transaction amounts (i.e., top 2000). In total, our reported SRVs affect a total amount of digital assets worth 428,600 USD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15988v1-abstract-full').style.display = 'none'; document.getElementById('2406.15988v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 10 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ISSTA 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09455">arXiv:2406.09455</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.09455">pdf</a>, <a href="https://arxiv.org/format/2406.09455">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Pandora: Towards General World Model with Natural Language Actions and Video States </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+J">Jiannan Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+G">Guangyi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yi Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Q">Qiyue Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Ning%2C+Y">Yuting Ning</a>, <a href="/search/cs?searchtype=author&amp;query=Zha%2C+Y">Yuheng Zha</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+Z">Zeyu Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+T">Tianhua Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+S">Shibo Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yemin Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhengzhong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+E+P">Eric P. 
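A sketch of how a DoS-style SRV indicator might be checked against a state-dependency graph, using networkx; the contract, variables, and edges are invented for illustration:

```python
import networkx as nx

# edge u -> v: state/condition u influences v (hypothetical auction contract)
g = nx.DiGraph()
g.add_edge("require(msg.value > highestBid)", "highestBid")
g.add_edge("highestBid", "withdraw()")
g.add_edge("bidderRefund", "withdraw()")

def revert_surface(graph, transfer_fn):
    """All states/conditions the value transfer transitively depends on.
    If an adversary can force a revert through any of them, the transfer
    can be blocked: the DoS-style SRV pattern."""
    return nx.ancestors(graph, transfer_fn)

print(revert_surface(g, "withdraw()"))
```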
arXiv:2406.09455 [pdf, other] (cs.CV, cs.AI, cs.CL)
Pandora: Towards General World Model with Natural Language Actions and Video States
Authors: Jiannan Xiang, Guangyi Liu, Yi Gu, Qiyue Gao, Yuting Ning, Yuheng Zha, Zeyu Feng, Tianhua Tao, Shibo Hao, Yemin Shi, Zhengzhong Liu, Eric P. Xing, Zhiting Hu
Abstract: World models simulate future states of the world in response to different actions. They facilitate interactive content creation and provide a foundation for grounded, long-horizon reasoning. Current foundation models do not fully meet the capabilities of general world models: large language models (LLMs) are constrained by their reliance on the language modality and their limited understanding of the physical world, while video models lack interactive action control over world simulations. This paper makes a step towards building a general world model by introducing Pandora, a hybrid autoregressive-diffusion model that simulates world states by generating videos and allows real-time control with free-text actions. Pandora achieves domain generality, video consistency, and controllability through large-scale pretraining and instruction tuning. Crucially, Pandora bypasses the cost of training from scratch by integrating a pretrained LLM (7B) and a pretrained video model, requiring only additional lightweight finetuning. We illustrate extensive outputs from Pandora across diverse domains (indoor/outdoor, natural/urban, human/robot, 2D/3D, etc.). The results indicate great potential for building stronger general world models with larger-scale training.
Submitted 12 June, 2024; originally announced June 2024.
Comments: Website: https://world-model.maitrix.org/
arXiv:2406.06615 [pdf, other] (cs.CL, cs.AI, cs.LG, cs.RO)
Language Guided Skill Discovery
Authors: Seungeun Rho, Laura Smith, Tianyu Li, Sergey Levine, Xue Bin Peng, Sehoon Ha
Abstract: Skill discovery methods enable agents to learn diverse emergent behaviors without explicit rewards. To make learned skills useful for unknown downstream tasks, obtaining a semantically diverse repertoire of skills is essential. While some approaches introduce a discriminator to distinguish skills and others aim to increase state coverage, no existing work directly addresses the "semantic diversity" of skills. We hypothesize that leveraging the semantic knowledge of large language models (LLMs) can improve the semantic diversity of the resulting behaviors. In this spirit, we introduce Language Guided Skill Discovery (LGSD), a skill discovery framework that aims to directly maximize the semantic diversity between skills. LGSD takes user prompts as input and outputs a set of semantically distinctive skills. The prompts serve as a means of constraining the search space to a semantically desired subspace, and the generated LLM outputs guide the agent to visit semantically diverse states within that subspace. We demonstrate that LGSD enables legged robots to visit different user-intended areas on a plane simply by changing the prompt. Furthermore, we show that language guidance aids in discovering more diverse skills than five existing skill discovery methods in robot-arm manipulation environments. Lastly, LGSD provides a simple way of utilizing learned skills via natural language.
Submitted 7 June, 2024; originally announced June 2024.
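The diversity objective can be pictured as rewarding states whose language descriptions sit far from anything already visited. A toy sketch in which a hash-seeded random projection stands in for a real LLM embedding:

```python
import numpy as np

def embed(text: str) -> np.ndarray:
    """Placeholder for an LLM text-embedding call (toy random projection)."""
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    v = rng.standard_normal(16)
    return v / np.linalg.norm(v)

def diversity_reward(new_state: str, visited: list) -> float:
    """Higher reward the farther the new state sits from its nearest neighbor."""
    if not visited:
        return 1.0
    sims = [float(embed(new_state) @ embed(s)) for s in visited]
    return 1.0 - max(sims)

print(diversity_reward("robot reaches the red zone",
                       ["robot stands still", "robot reaches the blue zone"]))
```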
arXiv:2406.05673 [pdf, other] (cs.AI, cs.CL)
Flow of Reasoning: Training LLMs for Divergent Problem Solving with Minimal Examples
Authors: Fangxu Yu, Lai Jiang, Haoqiang Kang, Shibo Hao, Lianhui Qin
Abstract: The ability to generate diverse solutions to a given problem is a hallmark of human creativity. This divergent reasoning is also crucial for machines, enhancing their robustness and enabling them to assist humans in many applications such as scientific discovery. However, existing approaches to multi-step reasoning with large language models (LLMs) have mostly focused on reasoning accuracy, without further discovering more diverse valid solutions. For example, supervised fine-tuning can improve LLM reasoning quality, but requires extensive supervised data to capture the full range of possible solutions. Reinforcement learning aims to find a limited set of highest-reward solutions while neglecting solution diversity. To fill this gap, we propose Flow of Reasoning (FoR), an efficient diversity-seeking LLM finetuning method aimed at improving reasoning quality and diversity with minimal data. FoR formulates multi-step LLM reasoning as a Markovian flow on a DAG-structured reasoning graph. This formulation allows us to incorporate and adapt principled GFlowNet approaches for finetuning LLMs to sample diverse reasoning paths with probabilities proportional to the (unnormalized) reward of target problems. Extensive experiments show that, with limited training examples (e.g., 15 examples), FoR enables the discovery of diverse, creative, high-quality solutions, greatly outperforming a wide range of existing inference and training methods across five challenging puzzle-solving tasks, including BlocksWorld (embodied reasoning), Game24 (math puzzle solving), Rubik's Cube (spatial reasoning), 1D-ARC (abstraction reasoning), and PrOntoQA (logical reasoning). Code is available at https://github.com/Yu-Fangxu/FoR.
Submitted 4 October, 2024; v1 submitted 9 June, 2024; originally announced June 2024.
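The GFlowNet connection suggests a trajectory-balance objective over sampled reasoning paths. A minimal sketch of that loss for a single trajectory; the step log-probabilities and reward are dummy values, and the backward policy is taken as fixed:

```python
import torch

log_z = torch.tensor(0.0, requires_grad=True)  # learnable log-partition estimate
log_pf = torch.tensor([-1.2, -0.8, -1.5])      # forward log-probs per step (from the LLM)
log_pb = torch.zeros(3)                        # fixed/uniform backward policy
log_reward = torch.tensor(2.0)                 # log unnormalized reward of the end state

# trajectory balance: (log Z + sum log P_F - sum log P_B - log R)^2
loss = (log_z + log_pf.sum() - log_pb.sum() - log_reward) ** 2
loss.backward()
print(float(loss), float(log_z.grad))
```

In practice the forward log-probabilities would come from the finetuned LLM's token distribution over reasoning steps, and log Z would be trained jointly with the policy.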
arXiv:2406.04983 [pdf, other] (cs.CV)
CityCraft: A Real Crafter for 3D City Generation
Authors: Jie Deng, Wenhao Chai, Junsheng Huang, Zhonghan Zhao, Qixuan Huang, Mingyan Gao, Jianshu Guo, Shengyu Hao, Wenhao Hu, Jenq-Neng Hwang, Xi Li, Gaoang Wang
Abstract: City scene generation has gained significant attention in autonomous driving, smart city development, and traffic simulation, helping to enhance infrastructure planning and monitoring solutions. Existing methods employ a two-stage process involving city layout generation, typically using Variational Autoencoders (VAEs), Generative Adversarial Networks (GANs), or Transformers, followed by neural rendering. These techniques often exhibit limited diversity and noticeable artifacts in the rendered city scenes: the scenes lack variety, resemble the training images, and show monotonous styles. Additionally, these methods lack planning capabilities, leading to less realistic generated scenes. In this paper, we introduce CityCraft, an innovative framework designed to enhance both the diversity and quality of urban scene generation. Our approach integrates three key stages: initially, a diffusion transformer (DiT) model is deployed to generate diverse and controllable 2D city layouts. Subsequently, a large language model (LLM) is utilized to strategically make land-use plans within these layouts based on user prompts and language guidelines. Based on the generated layout and city plan, we utilize an asset retrieval module and Blender for precise asset placement and scene construction. Furthermore, we contribute two new datasets to the field: 1) the CityCraft-OSM dataset, including 2D semantic layouts of urban areas, corresponding satellite images, and detailed annotations; and 2) the CityCraft-Buildings dataset, featuring thousands of diverse, high-quality 3D building assets. CityCraft achieves state-of-the-art performance in generating realistic 3D cities.
Submitted 7 June, 2024; originally announced June 2024.
Comments: 20 pages, 9 figures
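The three-stage pipeline is straightforward to express as a skeleton; every function below is a hypothetical stub standing in for the DiT layout model, the LLM planner, and the Blender-based assembly, respectively:

```python
def generate_layout(prompt: str) -> dict:
    """Stage 1 stub: a DiT layout model would return a 2D semantic layout."""
    return {"blocks": ["b0", "b1"]}

def plan_land_use(layout: dict, prompt: str) -> dict:
    """Stage 2 stub: an LLM assigns a land use to each block from the prompt."""
    return dict(zip(layout["blocks"], ["residential", "park"]))

def assemble_scene(layout: dict, plan: dict) -> list:
    """Stage 3 stub: asset retrieval + Blender placement per planned block."""
    return [f"{block}<-{use}" for block, use in plan.items()]

layout = generate_layout("riverside district")
plan = plan_land_use(layout, "riverside district")
print(assemble_scene(layout, plan))
```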