Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 297 results for author: <span class="mathjax">Jin, C</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Jin%2C+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Jin, C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Jin%2C+C&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Jin, C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Jin%2C+C&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Jin%2C+C&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Jin%2C+C&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Jin%2C+C&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Jin%2C+C&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Jin%2C+C&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Jin%2C+C&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09662">arXiv:2502.09662</a> <span> [<a href="https://arxiv.org/pdf/2502.09662">pdf</a>, <a href="https://arxiv.org/format/2502.09662">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Generalizable Cervical Cancer Screening via Large-scale Pretraining and Test-Time Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hao Jiang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Huangjing Lin</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yanning Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xi Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jiabo Ma</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+L">Li Ding</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Jun Hou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Runsheng Liu</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+Z">Zhizhong Chai</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+L">Luyang Luo</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Huijuan Shi</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+Y">Yinling Qian</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiong Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Changzhong Li</a>, <a href="/search/cs?searchtype=author&query=Han%2C+A">Anjia Han</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+R+C+K">Ronald Cheong Kin Chan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09662v1-abstract-short" style="display: inline;"> Cervical cancer is a leading malignancy in female reproductive system. While AI-assisted cytology offers a cost-effective and non-invasive screening solution, current systems struggle with generalizability in complex clinical scenarios. To address this issue, we introduced Smart-CCS, a generalizable Cervical Cancer Screening paradigm based on pretraining and adaptation to create robust and general… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09662v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09662v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09662v1-abstract-full" style="display: none;"> Cervical cancer is a leading malignancy in female reproductive system. While AI-assisted cytology offers a cost-effective and non-invasive screening solution, current systems struggle with generalizability in complex clinical scenarios. To address this issue, we introduced Smart-CCS, a generalizable Cervical Cancer Screening paradigm based on pretraining and adaptation to create robust and generalizable screening systems. To develop and validate Smart-CCS, we first curated a large-scale, multi-center dataset named CCS-127K, which comprises a total of 127,471 cervical cytology whole-slide images collected from 48 medical centers. By leveraging large-scale self-supervised pretraining, our CCS models are equipped with strong generalization capability, potentially generalizing across diverse scenarios. Then, we incorporated test-time adaptation to specifically optimize the trained CCS model for complex clinical settings, which adapts and refines predictions, improving real-world applicability. We conducted large-scale system evaluation among various cohorts. In retrospective cohorts, Smart-CCS achieved an overall area under the curve (AUC) value of 0.965 and sensitivity of 0.913 for cancer screening on 11 internal test datasets. In external testing, system performance maintained high at 0.950 AUC across 6 independent test datasets. In prospective cohorts, our Smart-CCS achieved AUCs of 0.947, 0.924, and 0.986 in three prospective centers, respectively. Moreover, the system demonstrated superior sensitivity in diagnosing cervical cancer, confirming the accuracy of our cancer screening results by using histology findings for validation. Interpretability analysis with cell and slide predictions further indicated that the system's decision-making aligns with clinical practice. Smart-CCS represents a significant advancement in cancer screening across diverse clinical contexts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09662v1-abstract-full').style.display = 'none'; document.getElementById('2502.09662v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07640">arXiv:2502.07640</a> <span> [<a href="https://arxiv.org/pdf/2502.07640">pdf</a>, <a href="https://arxiv.org/format/2502.07640">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Goedel-Prover: A Frontier Model for Open-Source Automated Theorem Proving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yong Lin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shange Tang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+B">Bohan Lyu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiayun Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongzhou Lin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kaiyu Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jia Li</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+M">Mengzhou Xia</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Danqi Chen</a>, <a href="/search/cs?searchtype=author&query=Arora%2C+S">Sanjeev Arora</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chi Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07640v2-abstract-short" style="display: inline;"> We introduce Goedel-Prover, an open-source large language model (LLM) that achieves the state-of-the-art (SOTA) performance in automated formal proof generation for mathematical problems. The key challenge in this field is the scarcity of formalized math statements and proofs, which we tackle in the following ways. We train statement formalizers to translate the natural language math problems from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07640v2-abstract-full').style.display = 'inline'; document.getElementById('2502.07640v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07640v2-abstract-full" style="display: none;"> We introduce Goedel-Prover, an open-source large language model (LLM) that achieves the state-of-the-art (SOTA) performance in automated formal proof generation for mathematical problems. The key challenge in this field is the scarcity of formalized math statements and proofs, which we tackle in the following ways. We train statement formalizers to translate the natural language math problems from Numina into formal language (Lean 4), creating a dataset of 1.64 million formal statements. LLMs are used to check that the formal statements accurately preserve the content of the original natural language problems. We then iteratively build a large dataset of formal proofs by training a series of provers. Each prover succeeds in proving many statements that the previous ones could not, and these new proofs are added to the training set for the next prover. Despite using only supervised fine-tuning, our final prover significantly outperforms the previous best open-source model, DeepSeek-Prover-V1.5, which employs reinforcement learning. On the miniF2F benchmark, our model achieves a success rate of 57.6% (Pass@32), surpassing DeepSeek-Prover-V1.5 by 7.6%. On PutnamBench, Goedel-Prover successfully solves 7 problems (Pass@512), ranking first on the leaderboard. Furthermore, it generates 29.7K formal proofs for Lean Workbook problems, nearly doubling the 15.7K produced by earlier works. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07640v2-abstract-full').style.display = 'none'; document.getElementById('2502.07640v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06814">arXiv:2502.06814</a> <span> [<a href="https://arxiv.org/pdf/2502.06814">pdf</a>, <a href="https://arxiv.org/format/2502.06814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Instruction Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chen Jin</a>, <a href="/search/cs?searchtype=author&query=Tanno%2C+R">Ryutaro Tanno</a>, <a href="/search/cs?searchtype=author&query=Saseendran%2C+A">Amrutha Saseendran</a>, <a href="/search/cs?searchtype=author&query=Diethe%2C+T">Tom Diethe</a>, <a href="/search/cs?searchtype=author&query=Teare%2C+P">Philip Teare</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06814v1-abstract-short" style="display: inline;"> We introduce Lavender, a simple supervised fine-tuning (SFT) method that boosts the performance of advanced vision-language models (VLMs) by leveraging state-of-the-art image generation models such as Stable Diffusion. Specifically, Lavender aligns the text-vision attention in the VLM transformer with the equivalent used by Stable Diffusion during SFT, instead of adapting separate encoders. This a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06814v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06814v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06814v1-abstract-full" style="display: none;"> We introduce Lavender, a simple supervised fine-tuning (SFT) method that boosts the performance of advanced vision-language models (VLMs) by leveraging state-of-the-art image generation models such as Stable Diffusion. Specifically, Lavender aligns the text-vision attention in the VLM transformer with the equivalent used by Stable Diffusion during SFT, instead of adapting separate encoders. This alignment enriches the model's visual understanding and significantly boosts performance across in- and out-of-distribution tasks. Lavender requires just 0.13 million training examples, 2.5% of typical large-scale SFT datasets, and fine-tunes on standard hardware (8 GPUs) in a single day. It consistently improves state-of-the-art open-source multimodal LLMs (e.g., Llama-3.2-11B, MiniCPM-Llama3-v2.5), achieving up to 30% gains and a 68% boost on challenging out-of-distribution medical QA tasks. By efficiently transferring the visual expertise of image generators with minimal supervision, Lavender offers a scalable solution for more accurate vision-language systems. All code, training data, and models will be shared at https://astrazeneca.github.io/vlm/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06814v1-abstract-full').style.display = 'none'; document.getElementById('2502.06814v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page at https://astrazeneca.github.io/vlm/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06453">arXiv:2502.06453</a> <span> [<a href="https://arxiv.org/pdf/2502.06453">pdf</a>, <a href="https://arxiv.org/format/2502.06453">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MATH-Perturb: Benchmarking LLMs' Math Reasoning Abilities against Hard Perturbations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaixuan Huang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jiacheng Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zihao Li</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+X">Xiang Ji</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jiawei Ge</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenzhe Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yingqing Guo</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+T">Tianle Cai</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Hui Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Runzhe Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yue Wu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+M">Ming Yin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shange Tang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yangsibo Huang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chi Jin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyun Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chiyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mengdi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06453v2-abstract-short" style="display: inline;"> Large language models have demonstrated impressive performance on challenging mathematical reasoning tasks, which has triggered the discussion of whether the performance is achieved by true reasoning capability or memorization. To investigate this question, prior work has constructed mathematical benchmarks when questions undergo simple perturbations -- modifications that still preserve the underl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06453v2-abstract-full').style.display = 'inline'; document.getElementById('2502.06453v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06453v2-abstract-full" style="display: none;"> Large language models have demonstrated impressive performance on challenging mathematical reasoning tasks, which has triggered the discussion of whether the performance is achieved by true reasoning capability or memorization. To investigate this question, prior work has constructed mathematical benchmarks when questions undergo simple perturbations -- modifications that still preserve the underlying reasoning patterns of the solutions. However, no work has explored hard perturbations, which fundamentally change the nature of the problem so that the original solution steps do not apply. To bridge the gap, we construct MATH-P-Simple and MATH-P-Hard via simple perturbation and hard perturbation, respectively. Each consists of 279 perturbed math problems derived from level-5 (hardest) problems in the MATH dataset (Hendrycksmath et. al., 2021). We observe significant performance drops on MATH-P-Hard across various models, including o1-mini (-16.49%) and gemini-2.0-flash-thinking (-12.9%). We also raise concerns about a novel form of memorization where models blindly apply learned problem-solving skills without assessing their applicability to modified contexts. This issue is amplified when using original problems for in-context learning. We call for research efforts to address this challenge, which is critical for developing more robust and reliable reasoning models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06453v2-abstract-full').style.display = 'none'; document.getElementById('2502.06453v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">v2: fix bugs in Fig. 1</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00896">arXiv:2502.00896</a> <span> [<a href="https://arxiv.org/pdf/2502.00896">pdf</a>, <a href="https://arxiv.org/format/2502.00896">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LoR-VP: Low-Rank Visual Prompting for Efficient Vision Model Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+C">Can Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Ying Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Mingyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shiyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenting Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaoxiao He</a>, <a href="/search/cs?searchtype=author&query=Han%2C+L">Ligong Han</a>, <a href="/search/cs?searchtype=author&query=Che%2C+T">Tong Che</a>, <a href="/search/cs?searchtype=author&query=Metaxas%2C+D+N">Dimitris N. Metaxas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00896v2-abstract-short" style="display: inline;"> Visual prompting has gained popularity as a method for adapting pre-trained models to specific tasks, particularly in the realm of parameter-efficient tuning. However, existing visual prompting techniques often pad the prompt parameters around the image, limiting the interaction between the visual prompts and the original image to a small set of patches while neglecting the inductive bias present… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00896v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00896v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00896v2-abstract-full" style="display: none;"> Visual prompting has gained popularity as a method for adapting pre-trained models to specific tasks, particularly in the realm of parameter-efficient tuning. However, existing visual prompting techniques often pad the prompt parameters around the image, limiting the interaction between the visual prompts and the original image to a small set of patches while neglecting the inductive bias present in shared information across different patches. In this study, we conduct a thorough preliminary investigation to identify and address these limitations. We propose a novel visual prompt design, introducing Low-Rank matrix multiplication for Visual Prompting (LoR-VP), which enables shared and patch-specific information across rows and columns of image pixels. Extensive experiments across seven network architectures and four datasets demonstrate significant improvements in both performance and efficiency compared to state-of-the-art visual prompting methods, achieving up to 6 times faster training times, utilizing 18 times fewer visual prompt parameters, and delivering a 3.1% improvement in performance. The code is available as https://github.com/jincan333/LoR-VP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00896v2-abstract-full').style.display = 'none'; document.getElementById('2502.00896v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00709">arXiv:2502.00709</a> <span> [<a href="https://arxiv.org/pdf/2502.00709">pdf</a>, <a href="https://arxiv.org/format/2502.00709">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> RankFlow: A Multi-Role Collaborative Reranking Workflow Utilizing Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+C">Can Jin</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Hongwu Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Anxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+N">Nuo Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jiahui Zhao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xi Xie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kuangzheng Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+S">Shuya Feng</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+K">Kai Zhong</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+C">Caiwen Ding</a>, <a href="/search/cs?searchtype=author&query=Metaxas%2C+D+N">Dimitris N. Metaxas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00709v2-abstract-short" style="display: inline;"> In an Information Retrieval (IR) system, reranking plays a critical role by sorting candidate passages according to their relevance to a specific query. This process demands a nuanced understanding of the variations among passages linked to the query. In this work, we introduce RankFlow, a multi-role reranking workflow that leverages the capabilities of Large Language Models (LLMs) and role specia… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00709v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00709v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00709v2-abstract-full" style="display: none;"> In an Information Retrieval (IR) system, reranking plays a critical role by sorting candidate passages according to their relevance to a specific query. This process demands a nuanced understanding of the variations among passages linked to the query. In this work, we introduce RankFlow, a multi-role reranking workflow that leverages the capabilities of Large Language Models (LLMs) and role specializations to improve reranking performance. RankFlow enlists LLMs to fulfill four distinct roles: the query Rewriter, the pseudo Answerer, the passage Summarizer, and the Reranker. This orchestrated approach enables RankFlow to: (1) accurately interpret queries, (2) draw upon LLMs' extensive pre-existing knowledge, (3) distill passages into concise versions, and (4) assess passages in a comprehensive manner, resulting in notably better reranking results. Our experimental results reveal that RankFlow outperforms existing leading approaches on widely recognized IR benchmarks, such as TREC-DL, BEIR, and NovelEval. Additionally, we investigate the individual contributions of each role in RankFlow. Code is available at https://github.com/jincan333/RankFlow. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00709v2-abstract-full').style.display = 'none'; document.getElementById('2502.00709v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00674">arXiv:2502.00674</a> <span> [<a href="https://arxiv.org/pdf/2502.00674">pdf</a>, <a href="https://arxiv.org/format/2502.00674">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Mixture-of-Agents: Is Mixing Different Large Language Models Beneficial? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenzhe Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yong Lin</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+M">Mengzhou Xia</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chi Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00674v1-abstract-short" style="display: inline;"> Ensembling outputs from diverse sources is a straightforward yet effective approach to boost performance. Mixture-of-Agents (MoA) is one such popular ensemble method that aggregates outputs from multiple different Large Language Models (LLMs). This paper raises the question in the context of language models: is mixing different LLMs truly beneficial? We propose Self-MoA -- an ensemble method that… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00674v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00674v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00674v1-abstract-full" style="display: none;"> Ensembling outputs from diverse sources is a straightforward yet effective approach to boost performance. Mixture-of-Agents (MoA) is one such popular ensemble method that aggregates outputs from multiple different Large Language Models (LLMs). This paper raises the question in the context of language models: is mixing different LLMs truly beneficial? We propose Self-MoA -- an ensemble method that aggregates outputs from only the single top-performing LLM. Our extensive experiments reveal that, surprisingly, Self-MoA outperforms standard MoA that mixes different LLMs in a large number of scenarios: Self-MoA achieves $6.6\%$ improvement over MoA on the AlpacaEval 2.0 benchmark, and an average of $3.8\%$ improvement across various benchmarks, including MMLU, CRUX, and MATH. Applying Self-MoA to one of the top-ranking models in AlpacaEval 2.0 directly achieves the new state-of-the-art performance on the leaderboard. To understand the effectiveness of Self-MoA, we systematically investigate the trade-off between diversity and quality of outputs under various MoA settings. We confirm that the MoA performance is rather sensitive to the quality, and mixing different LLMs often lowers the average quality of the models. To complement the study, we identify the scenarios where mixing different LLMs could be helpful. This paper further introduces a sequential version of Self-MoA, that is capable of aggregating a large number of LLM outputs on-the-fly over multiple rounds, and is as effective as aggregating all outputs at once. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00674v1-abstract-full').style.display = 'none'; document.getElementById('2502.00674v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00253">arXiv:2502.00253</a> <span> [<a href="https://arxiv.org/pdf/2502.00253">pdf</a>, <a href="https://arxiv.org/format/2502.00253">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Patch Triplet Similarity Purification for Guided Real-World Low-Dose CT Image Denoising </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Long%2C+J">Junhao Long</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fengwei Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Juncheng Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Baoping Zhang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chao Jin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+C">Changliang Zou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jun Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00253v1-abstract-short" style="display: inline;"> Image denoising of low-dose computed tomography (LDCT) is an important problem for clinical diagnosis with reduced radiation exposure. Previous methods are mostly trained with pairs of synthetic or misaligned LDCT and normal-dose CT (NDCT) images. However, trained with synthetic noise or misaligned LDCT/NDCT image pairs, the denoising networks would suffer from blurry structure or motion artifacts… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00253v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00253v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00253v1-abstract-full" style="display: none;"> Image denoising of low-dose computed tomography (LDCT) is an important problem for clinical diagnosis with reduced radiation exposure. Previous methods are mostly trained with pairs of synthetic or misaligned LDCT and normal-dose CT (NDCT) images. However, trained with synthetic noise or misaligned LDCT/NDCT image pairs, the denoising networks would suffer from blurry structure or motion artifacts. Since non-contrast CT (NCCT) images share the content characteristics to the corresponding NDCT images in a three-phase scan, they can potentially provide useful information for real-world LDCT image denoising. To exploit this aspect, in this paper, we propose to incorporate clean NCCT images as useful guidance for the learning of real-world LDCT image denoising networks. To alleviate the issue of spatial misalignment in training data, we design a new Patch Triplet Similarity Purification (PTSP) strategy to select highly similar patch (instead of image) triplets of LDCT, NDCT, and NCCT images for network training. Furthermore, we modify two image denoising transformers of SwinIR and HAT to accommodate the NCCT image guidance, by replacing vanilla self-attention with cross-attention. On our collected clinical dataset, the modified transformers trained with the data selected by our PTSP strategy show better performance than 15 comparison methods on real-world LDCT image denoising. Ablation studies validate the effectiveness of our NCCT image guidance and PTSP strategy. We will publicly release our data and code. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00253v1-abstract-full').style.display = 'none'; document.getElementById('2502.00253v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12326">arXiv:2501.12326</a> <span> [<a href="https://arxiv.org/pdf/2501.12326">pdf</a>, <a href="https://arxiv.org/format/2501.12326">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> UI-TARS: Pioneering Automated GUI Interaction with Native Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yujia Qin</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yining Ye</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Junjie Fang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoming Wang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Shihao Liang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+S">Shizuo Tian</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junda Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiahao Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunxin Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shijue Huang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+W">Wanjun Zhong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kuanye Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiale Yang</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+Y">Yu Miao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Woyu Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Longxiang Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xu Jiang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Q">Qianli Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingyu Li</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+X">Xiaojun Xiao</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+K">Kai Cai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chuang Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yaowei Zheng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chaolin Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen Li</a> , et al. (10 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12326v1-abstract-short" style="display: inline;"> This paper introduces UI-TARS, a native GUI agent model that solely perceives the screenshots as input and performs human-like interactions (e.g., keyboard and mouse operations). Unlike prevailing agent frameworks that depend on heavily wrapped commercial models (e.g., GPT-4o) with expert-crafted prompts and workflows, UI-TARS is an end-to-end model that outperforms these sophisticated frameworks.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12326v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12326v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12326v1-abstract-full" style="display: none;"> This paper introduces UI-TARS, a native GUI agent model that solely perceives the screenshots as input and performs human-like interactions (e.g., keyboard and mouse operations). Unlike prevailing agent frameworks that depend on heavily wrapped commercial models (e.g., GPT-4o) with expert-crafted prompts and workflows, UI-TARS is an end-to-end model that outperforms these sophisticated frameworks. Experiments demonstrate its superior performance: UI-TARS achieves SOTA performance in 10+ GUI agent benchmarks evaluating perception, grounding, and GUI task execution. Notably, in the OSWorld benchmark, UI-TARS achieves scores of 24.6 with 50 steps and 22.7 with 15 steps, outperforming Claude (22.0 and 14.9 respectively). In AndroidWorld, UI-TARS achieves 46.6, surpassing GPT-4o (34.5). UI-TARS incorporates several key innovations: (1) Enhanced Perception: leveraging a large-scale dataset of GUI screenshots for context-aware understanding of UI elements and precise captioning; (2) Unified Action Modeling, which standardizes actions into a unified space across platforms and achieves precise grounding and interaction through large-scale action traces; (3) System-2 Reasoning, which incorporates deliberate reasoning into multi-step decision making, involving multiple reasoning patterns such as task decomposition, reflection thinking, milestone recognition, etc. (4) Iterative Training with Reflective Online Traces, which addresses the data bottleneck by automatically collecting, filtering, and reflectively refining new interaction traces on hundreds of virtual machines. Through iterative training and reflection tuning, UI-TARS continuously learns from its mistakes and adapts to unforeseen situations with minimal human intervention. We also analyze the evolution path of GUI agents to guide the further development of this domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12326v1-abstract-full').style.display = 'none'; document.getElementById('2501.12326v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19723">arXiv:2412.19723</a> <span> [<a href="https://arxiv.org/pdf/2412.19723">pdf</a>, <a href="https://arxiv.org/format/2412.19723">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> OS-Genesis: Automating GUI Agent Trajectory Construction via Reverse Task Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Q">Qiushi Sun</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+K">Kanzhi Cheng</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Zichen Ding</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chuanyang Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yian Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fangzhi Xu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhenyu Wu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+C">Chengyou Jia</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liheng Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhoumianze Liu</a>, <a href="/search/cs?searchtype=author&query=Kao%2C+B">Ben Kao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guohao Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Junxian He</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhiyong Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19723v1-abstract-short" style="display: inline;"> Graphical User Interface (GUI) agents powered by Vision-Language Models (VLMs) have demonstrated human-like computer control capability. Despite their utility in advancing digital automation, a critical bottleneck persists: collecting high-quality trajectory data for training. Common practices for collecting such data rely on human supervision or synthetic data generation through executing pre-def… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19723v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19723v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19723v1-abstract-full" style="display: none;"> Graphical User Interface (GUI) agents powered by Vision-Language Models (VLMs) have demonstrated human-like computer control capability. Despite their utility in advancing digital automation, a critical bottleneck persists: collecting high-quality trajectory data for training. Common practices for collecting such data rely on human supervision or synthetic data generation through executing pre-defined tasks, which are either resource-intensive or unable to guarantee data quality. Moreover, these methods suffer from limited data diversity and significant gaps between synthetic data and real-world environments. To address these challenges, we propose OS-Genesis, a novel GUI data synthesis pipeline that reverses the conventional trajectory collection process. Instead of relying on pre-defined tasks, OS-Genesis enables agents first to perceive environments and perform step-wise interactions, then retrospectively derive high-quality tasks to enable trajectory-level exploration. A trajectory reward model is then employed to ensure the quality of the generated trajectories. We demonstrate that training GUI agents with OS-Genesis significantly improves their performance on highly challenging online benchmarks. In-depth analysis further validates OS-Genesis's efficiency and its superior data quality and diversity compared to existing synthesis methods. Our codes, data, and checkpoints are available at \href{https://qiushisun.github.io/OS-Genesis-Home/}{OS-Genesis Homepage}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19723v1-abstract-full').style.display = 'none'; document.getElementById('2412.19723v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18710">arXiv:2412.18710</a> <span> [<a href="https://arxiv.org/pdf/2412.18710">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Simi-SFX: A similarity-based conditioning method for controllable sound effect synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yunyi Liu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Craig Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18710v1-abstract-short" style="display: inline;"> Generating sound effects with controllable variations is a challenging task, traditionally addressed using sophisticated physical models that require in-depth knowledge of signal processing parameters and algorithms. In the era of generative and large language models, text has emerged as a common, human-interpretable interface for controlling sound synthesis. However, the discrete and qualitative… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18710v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18710v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18710v1-abstract-full" style="display: none;"> Generating sound effects with controllable variations is a challenging task, traditionally addressed using sophisticated physical models that require in-depth knowledge of signal processing parameters and algorithms. In the era of generative and large language models, text has emerged as a common, human-interpretable interface for controlling sound synthesis. However, the discrete and qualitative nature of language tokens makes it difficult to capture subtle timbral variations across different sounds. In this research, we propose a novel similarity-based conditioning method for sound synthesis, leveraging differentiable digital signal processing (DDSP). This approach combines the use of latent space for learning and controlling audio timbre with an intuitive guiding vector, normalized within the range [0,1], to encode categorical acoustic information. By utilizing pre-trained audio representation models, our method achieves expressive and fine-grained timbre control. To benchmark our approach, we introduce two sound effect datasets--Footstep-set and Impact-set--designed to evaluate both controllability and sound quality. Regression analysis demonstrates that the proposed similarity score effectively controls timbre variations and enables creative applications such as timbre interpolation between discrete classes. Our work provides a robust and versatile framework for sound effect synthesis, bridging the gap between traditional signal processing and modern machine learning techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18710v1-abstract-full').style.display = 'none'; document.getElementById('2412.18710v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17162">arXiv:2412.17162</a> <span> [<a href="https://arxiv.org/pdf/2412.17162">pdf</a>, <a href="https://arxiv.org/format/2412.17162">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generative Diffusion Modeling: A Practical Handbook </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Zihan Ding</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chi Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17162v1-abstract-short" style="display: inline;"> This handbook offers a unified perspective on diffusion models, encompassing diffusion probabilistic models, score-based generative models, consistency models, rectified flow, and related methods. By standardizing notations and aligning them with code implementations, it aims to bridge the "paper-to-code" gap and facilitate robust implementations and fair comparisons. The content encompasses the f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17162v1-abstract-full').style.display = 'inline'; document.getElementById('2412.17162v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17162v1-abstract-full" style="display: none;"> This handbook offers a unified perspective on diffusion models, encompassing diffusion probabilistic models, score-based generative models, consistency models, rectified flow, and related methods. By standardizing notations and aligning them with code implementations, it aims to bridge the "paper-to-code" gap and facilitate robust implementations and fair comparisons. The content encompasses the fundamentals of diffusion models, the pre-training process, and various post-training methods. Post-training techniques include model distillation and reward-based fine-tuning. Designed as a practical guide, it emphasizes clarity and usability over theoretical depth, focusing on widely adopted approaches in generative modeling with diffusion models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17162v1-abstract-full').style.display = 'none'; document.getElementById('2412.17162v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16516">arXiv:2412.16516</a> <span> [<a href="https://arxiv.org/pdf/2412.16516">pdf</a>, <a href="https://arxiv.org/format/2412.16516">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> HammerBench: Fine-Grained Function-Calling Evaluation in Real Mobile Device Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiamu Zhou</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+M">Muning Wen</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+X">Xiaoyun Mo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haoyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qiqiang Lin</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xihuai Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weinan Zhang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Q">Qiuying Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16516v1-abstract-short" style="display: inline;"> Evaluating the capabilities of large language models (LLMs) in human-LLM interactions remains challenging due to the inherent complexity and openness of dialogue processes. This paper introduces HammerBench, a novel benchmarking framework designed to assess the function-calling ability of LLMs more effectively in such interactions. We model a wide range of real-world user scenarios on mobile devic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16516v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16516v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16516v1-abstract-full" style="display: none;"> Evaluating the capabilities of large language models (LLMs) in human-LLM interactions remains challenging due to the inherent complexity and openness of dialogue processes. This paper introduces HammerBench, a novel benchmarking framework designed to assess the function-calling ability of LLMs more effectively in such interactions. We model a wide range of real-world user scenarios on mobile devices, encompassing imperfect instructions, diverse question-answer trajectories, intent/argument shifts, and the use of external individual information through pronouns. To construct the corresponding datasets, we propose a comprehensive pipeline that involves LLM-generated data and multiple rounds of human validation, ensuring high data quality. Additionally, we decompose the conversations into function-calling snapshots, enabling a fine-grained evaluation of each turn. We evaluate several popular LLMs using HammerBench and highlight different performance aspects. Our empirical findings reveal that errors in parameter naming constitute the primary factor behind conversation failures across different data types. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16516v1-abstract-full').style.display = 'none'; document.getElementById('2412.16516v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15689">arXiv:2412.15689</a> <span> [<a href="https://arxiv.org/pdf/2412.15689">pdf</a>, <a href="https://arxiv.org/format/2412.15689">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DOLLAR: Few-Step Video Generation via Distillation and Latent Reward Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Zihan Ding</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chi Jin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Difan Liu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Haitian Zheng</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+K+K">Krishna Kumar Singh</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+Y">Yan Kang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuchen Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15689v1-abstract-short" style="display: inline;"> Diffusion probabilistic models have shown significant progress in video generation; however, their computational efficiency is limited by the large number of sampling steps required. Reducing sampling steps often compromises video quality or generation diversity. In this work, we introduce a distillation method that combines variational score distillation and consistency distillation to achieve fe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15689v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15689v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15689v1-abstract-full" style="display: none;"> Diffusion probabilistic models have shown significant progress in video generation; however, their computational efficiency is limited by the large number of sampling steps required. Reducing sampling steps often compromises video quality or generation diversity. In this work, we introduce a distillation method that combines variational score distillation and consistency distillation to achieve few-step video generation, maintaining both high quality and diversity. We also propose a latent reward model fine-tuning approach to further enhance video generation performance according to any specified reward metric. This approach reduces memory usage and does not require the reward to be differentiable. Our method demonstrates state-of-the-art performance in few-step generation for 10-second videos (128 frames at 12 FPS). The distilled student model achieves a score of 82.57 on VBench, surpassing the teacher model as well as baseline models Gen-3, T2V-Turbo, and Kling. One-step distillation accelerates the teacher model's diffusion sampling by up to 278.6 times, enabling near real-time generation. Human evaluations further validate the superior performance of our 4-step student models compared to teacher model using 50-step DDIM sampling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15689v1-abstract-full').style.display = 'none'; document.getElementById('2412.15689v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14474">arXiv:2412.14474</a> <span> [<a href="https://arxiv.org/pdf/2412.14474">pdf</a>, <a href="https://arxiv.org/format/2412.14474">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Benign Overfitting in Out-of-Distribution Generalization of Linear Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shange Tang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiayun Wu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jianqing Fan</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chi Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14474v1-abstract-short" style="display: inline;"> Benign overfitting refers to the phenomenon where an over-parameterized model fits the training data perfectly, including noise in the data, but still generalizes well to the unseen test data. While prior work provides some theoretical understanding of this phenomenon under the in-distribution setup, modern machine learning often operates in a more challenging Out-of-Distribution (OOD) regime, whe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14474v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14474v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14474v1-abstract-full" style="display: none;"> Benign overfitting refers to the phenomenon where an over-parameterized model fits the training data perfectly, including noise in the data, but still generalizes well to the unseen test data. While prior work provides some theoretical understanding of this phenomenon under the in-distribution setup, modern machine learning often operates in a more challenging Out-of-Distribution (OOD) regime, where the target (test) distribution can be rather different from the source (training) distribution. In this work, we take an initial step towards understanding benign overfitting in the OOD regime by focusing on the basic setup of over-parameterized linear models under covariate shift. We provide non-asymptotic guarantees proving that benign overfitting occurs in standard ridge regression, even under the OOD regime when the target covariance satisfies certain structural conditions. We identify several vital quantities relating to source and target covariance, which govern the performance of OOD generalization. Our result is sharp, which provably recovers prior in-distribution benign overfitting guarantee [Tsigler and Bartlett, 2023], as well as under-parameterized OOD guarantee [Ge et al., 2024] when specializing to each setup. Moreover, we also present theoretical results for a more general family of target covariance matrix, where standard ridge regression only achieves a slow statistical rate of $O(1/\sqrt{n})$ for the excess risk, while Principal Component Regression (PCR) is guaranteed to achieve the fast rate $O(1/n)$, where $n$ is the number of samples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14474v1-abstract-full').style.display = 'none'; document.getElementById('2412.14474v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">58 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08941">arXiv:2412.08941</a> <span> [<a href="https://arxiv.org/pdf/2412.08941">pdf</a>, <a href="https://arxiv.org/format/2412.08941">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Optimized Gradient Clipping for Noisy Label Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+X">Xichen Ye</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yifan Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weizhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoqiang Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yifan Chen</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08941v4-abstract-short" style="display: inline;"> Previous research has shown that constraining the gradient of loss function with respect to model-predicted probabilities can enhance the model robustness against noisy labels. These methods typically specify a fixed optimal threshold for gradient clipping through validation data to obtain the desired robustness against noise. However, this common practice overlooks the dynamic distribution of gra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08941v4-abstract-full').style.display = 'inline'; document.getElementById('2412.08941v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08941v4-abstract-full" style="display: none;"> Previous research has shown that constraining the gradient of loss function with respect to model-predicted probabilities can enhance the model robustness against noisy labels. These methods typically specify a fixed optimal threshold for gradient clipping through validation data to obtain the desired robustness against noise. However, this common practice overlooks the dynamic distribution of gradients from both clean and noisy-labeled samples at different stages of training, significantly limiting the model capability to adapt to the variable nature of gradients throughout the training process. To address this issue, we propose a simple yet effective approach called Optimized Gradient Clipping (OGC), which dynamically adjusts the clipping threshold based on the ratio of noise gradients to clean gradients after clipping, estimated by modeling the distributions of clean and noisy samples. This approach allows us to modify the clipping threshold at each training step, effectively controlling the influence of noise gradients. Additionally, we provide statistical analysis to certify the noise-tolerance ability of OGC. Our extensive experiments across various types of label noise, including symmetric, asymmetric, instance-dependent, and real-world noise, demonstrate the effectiveness of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08941v4-abstract-full').style.display = 'none'; document.getElementById('2412.08941v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI2025</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07; 68T10 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.5.1; I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06847">arXiv:2412.06847</a> <span> [<a href="https://arxiv.org/pdf/2412.06847">pdf</a>, <a href="https://arxiv.org/format/2412.06847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> M$^{3}$-20M: A Large-Scale Multi-Modal Molecule Dataset for AI-driven Drug Design and Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+S">Siyuan Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lexuan Wang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chang Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinxian Wang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Han Peng</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Huayang Shi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wengen Li</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jihong Guan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shuigeng Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06847v1-abstract-short" style="display: inline;"> This paper introduces M$^{3}$-20M, a large-scale Multi-Modal Molecular dataset that contains over 20 million molecules. Designed to support AI-driven drug design and discovery, M$^{3}$-20M is 71 times more in the number of molecules than the largest existing dataset, providing an unprecedented scale that can highly benefit training or fine-tuning large (language) models with superior performance f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06847v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06847v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06847v1-abstract-full" style="display: none;"> This paper introduces M$^{3}$-20M, a large-scale Multi-Modal Molecular dataset that contains over 20 million molecules. Designed to support AI-driven drug design and discovery, M$^{3}$-20M is 71 times more in the number of molecules than the largest existing dataset, providing an unprecedented scale that can highly benefit training or fine-tuning large (language) models with superior performance for drug design and discovery. This dataset integrates one-dimensional SMILES, two-dimensional molecular graphs, three-dimensional molecular structures, physicochemical properties, and textual descriptions collected through web crawling and generated by using GPT-3.5, offering a comprehensive view of each molecule. To demonstrate the power of M$^{3}$-20M in drug design and discovery, we conduct extensive experiments on two key tasks: molecule generation and molecular property prediction, using large language models including GLM4, GPT-3.5, and GPT-4. Our experimental results show that M$^{3}$-20M can significantly boost model performance in both tasks. Specifically, it enables the models to generate more diverse and valid molecular structures and achieve higher property prediction accuracy than the existing single-modal datasets, which validates the value and potential of M$^{3}$-20M in supporting AI-driven drug design and discovery. The dataset is available at \url{https://github.com/bz99bz/M-3}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06847v1-abstract-full').style.display = 'none'; document.getElementById('2412.06847v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05596">arXiv:2412.05596</a> <span> [<a href="https://arxiv.org/pdf/2412.05596">pdf</a>, <a href="https://arxiv.org/format/2412.05596">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TB-HSU: Hierarchical 3D Scene Understanding with Contextual Affordances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wenting Xu</a>, <a href="/search/cs?searchtype=author&query=Ila%2C+V">Viorela Ila</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Luping Zhou</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C+T">Craig T. Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05596v1-abstract-short" style="display: inline;"> The concept of function and affordance is a critical aspect of 3D scene understanding and supports task-oriented objectives. In this work, we develop a model that learns to structure and vary functional affordance across a 3D hierarchical scene graph representing the spatial organization of a scene. The varying functional affordance is designed to integrate with the varying spatial context of the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05596v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05596v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05596v1-abstract-full" style="display: none;"> The concept of function and affordance is a critical aspect of 3D scene understanding and supports task-oriented objectives. In this work, we develop a model that learns to structure and vary functional affordance across a 3D hierarchical scene graph representing the spatial organization of a scene. The varying functional affordance is designed to integrate with the varying spatial context of the graph. More specifically, we develop an algorithm that learns to construct a 3D hierarchical scene graph (3DHSG) that captures the spatial organization of the scene. Starting from segmented object point clouds and object semantic labels, we develop a 3DHSG with a top node that identifies the room label, child nodes that define local spatial regions inside the room with region-specific affordances, and grand-child nodes indicating object locations and object-specific affordances. To support this work, we create a custom 3DHSG dataset that provides ground truth data for local spatial regions with region-specific affordances and also object-specific affordances for each object. We employ a transformer-based model to learn the 3DHSG. We use a multi-task learning framework that learns both room classification and learns to define spatial regions within the room with region-specific affordances. Our work improves on the performance of state-of-the-art baseline models and shows one approach for applying transformer models to 3D scene understanding and the generation of 3DHSGs that capture the spatial organization of a room. The code and dataset are publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05596v1-abstract-full').style.display = 'none'; document.getElementById('2412.05596v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to AAAI2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05486">arXiv:2412.05486</a> <span> [<a href="https://arxiv.org/pdf/2412.05486">pdf</a>, <a href="https://arxiv.org/format/2412.05486">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> A Scene Representation for Online Spatial Sonification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lan Wu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Craig Jin</a>, <a href="/search/cs?searchtype=author&query=Uttsha%2C+M+M">Monisha Mushtary Uttsha</a>, <a href="/search/cs?searchtype=author&query=Vidal-Calleja%2C+T">Teresa Vidal-Calleja</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05486v1-abstract-short" style="display: inline;"> Robotic perception is emerging as a crucial technology for navigation aids, particularly benefiting individuals with visual impairments through sonification. This paper presents a novel mapping framework that accurately represents spatial geometry for sonification, transforming physical spaces into auditory experiences. By leveraging depth sensors, we convert incrementally built 3D scenes into a c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05486v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05486v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05486v1-abstract-full" style="display: none;"> Robotic perception is emerging as a crucial technology for navigation aids, particularly benefiting individuals with visual impairments through sonification. This paper presents a novel mapping framework that accurately represents spatial geometry for sonification, transforming physical spaces into auditory experiences. By leveraging depth sensors, we convert incrementally built 3D scenes into a compact 360-degree representation based on angular and distance information, aligning with human auditory perception. Our proposed mapping framework utilises a sensor-centric structure, maintaining 2D circular or 3D cylindrical representations, and employs the VDB-GPDF for efficient online mapping. We introduce two sonification modes-circular ranging and circular ranging of objects-along with real-time user control over auditory filters. Incorporating binaural room impulse responses, our framework provides perceptually robust auditory feedback. Quantitative and qualitative evaluations demonstrate superior performance in accuracy, coverage, and timing compared to existing approaches, with effective handling of dynamic objects. The accompanying video showcases the practical application of spatial sonification in room-like environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05486v1-abstract-full').style.display = 'none'; document.getElementById('2412.05486v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04814">arXiv:2412.04814</a> <span> [<a href="https://arxiv.org/pdf/2412.04814">pdf</a>, <a href="https://arxiv.org/format/2412.04814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LiFT: Leveraging Human Feedback for Text-to-Video Model Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yibin Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhiyu Tan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyan Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaomeng Yang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04814v2-abstract-short" style="display: inline;"> Recent advancements in text-to-video (T2V) generative models have shown impressive capabilities. However, these models are still inadequate in aligning synthesized videos with human preferences (e.g., accurately reflecting text descriptions), which is particularly difficult to address, as human preferences are inherently subjective and challenging to formalize as objective functions. Therefore, th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04814v2-abstract-full').style.display = 'inline'; document.getElementById('2412.04814v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04814v2-abstract-full" style="display: none;"> Recent advancements in text-to-video (T2V) generative models have shown impressive capabilities. However, these models are still inadequate in aligning synthesized videos with human preferences (e.g., accurately reflecting text descriptions), which is particularly difficult to address, as human preferences are inherently subjective and challenging to formalize as objective functions. Therefore, this paper proposes LiFT, a novel fine-tuning method leveraging human feedback for T2V model alignment. Specifically, we first construct a Human Rating Annotation dataset, LiFT-HRA, consisting of approximately 10k human annotations, each including a score and its corresponding rationale. Based on this, we train a reward model LiFT-Critic to learn reward function effectively, which serves as a proxy for human judgment, measuring the alignment between given videos and human expectations. Lastly, we leverage the learned reward function to align the T2V model by maximizing the reward-weighted likelihood. As a case study, we apply our pipeline to CogVideoX-2B, showing that the fine-tuned model outperforms the CogVideoX-5B across all 16 metrics, highlighting the potential of human feedback in improving the alignment and quality of synthesized videos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04814v2-abstract-full').style.display = 'none'; document.getElementById('2412.04814v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://codegoat24.github.io/LiFT</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03822">arXiv:2412.03822</a> <span> [<a href="https://arxiv.org/pdf/2412.03822">pdf</a>, <a href="https://arxiv.org/format/2412.03822">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Beyond the Binary: Capturing Diverse Preferences With Reward Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Padmakumar%2C+V">Vishakh Padmakumar</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chuanyang Jin</a>, <a href="/search/cs?searchtype=author&query=Kirk%2C+H+R">Hannah Rose Kirk</a>, <a href="/search/cs?searchtype=author&query=He%2C+H">He He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03822v1-abstract-short" style="display: inline;"> Large language models (LLMs) are increasingly deployed via public-facing interfaces to interact with millions of users, each with diverse preferences. Despite this, preference tuning of LLMs predominantly relies on reward models trained using binary judgments where annotators select the preferred choice out of pairs of model outputs. In this work, we argue that this reliance on binary choices does… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03822v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03822v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03822v1-abstract-full" style="display: none;"> Large language models (LLMs) are increasingly deployed via public-facing interfaces to interact with millions of users, each with diverse preferences. Despite this, preference tuning of LLMs predominantly relies on reward models trained using binary judgments where annotators select the preferred choice out of pairs of model outputs. In this work, we argue that this reliance on binary choices does not capture the broader, aggregate preferences of the target user in real-world tasks. We propose a taxonomy that identifies two dimensions of subjectivity where different users disagree on the preferred output-namely, the Plurality of Responses to Prompts, where prompts allow for multiple correct answers, and the Indistinguishability of Responses, where candidate outputs are paraphrases of each other. We show that reward models correlate weakly with user preferences in these cases. As a first step to address this issue, we introduce a simple yet effective method that augments existing binary preference datasets with synthetic preference judgments to estimate potential user disagreement. Incorporating these via a margin term as a form of regularization during model training yields predictions that better align with the aggregate user preferences. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03822v1-abstract-full').style.display = 'none'; document.getElementById('2412.03822v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00091">arXiv:2412.00091</a> <span> [<a href="https://arxiv.org/pdf/2412.00091">pdf</a>, <a href="https://arxiv.org/format/2412.00091">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Graph Canvas for Controllable 3D Scene Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Libin Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shen Chen</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+S">Sen Jia</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jingzhe Shi</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhongyu Jiang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Can Jin</a>, <a href="/search/cs?searchtype=author&query=Zongkai%2C+W">Wu Zongkai</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+J">Jenq-Neng Hwang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00091v2-abstract-short" style="display: inline;"> Spatial intelligence is foundational to AI systems that interact with the physical world, particularly in 3D scene generation and spatial comprehension. Current methodologies for 3D scene generation often rely heavily on predefined datasets, and struggle to adapt dynamically to changing spatial relationships. In this paper, we introduce GraphCanvas3D, a programmable, extensible, and adaptable fram… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00091v2-abstract-full').style.display = 'inline'; document.getElementById('2412.00091v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00091v2-abstract-full" style="display: none;"> Spatial intelligence is foundational to AI systems that interact with the physical world, particularly in 3D scene generation and spatial comprehension. Current methodologies for 3D scene generation often rely heavily on predefined datasets, and struggle to adapt dynamically to changing spatial relationships. In this paper, we introduce GraphCanvas3D, a programmable, extensible, and adaptable framework for controllable 3D scene generation. Leveraging in-context learning, GraphCanvas3D enables dynamic adaptability without the need for retraining, supporting flexible and customizable scene creation. Our framework employs hierarchical, graph-driven scene descriptions, representing spatial elements as graph nodes and establishing coherent relationships among objects in 3D environments. Unlike conventional approaches, which are constrained in adaptability and often require predefined input masks or retraining for modifications, GraphCanvas3D allows for seamless object manipulation and scene adjustments on the fly. Additionally, GraphCanvas3D supports 4D scene generation, incorporating temporal dynamics to model changes over time. Experimental results and user studies demonstrate that GraphCanvas3D enhances usability, flexibility, and adaptability for scene generation. Our code and models are available on the project website: https://github.com/ILGLJ/Graph-Canvas. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00091v2-abstract-full').style.display = 'none'; document.getElementById('2412.00091v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12180">arXiv:2411.12180</a> <span> [<a href="https://arxiv.org/pdf/2411.12180">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Digital Libraries">cs.DL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Quantifying the Innovativeness of Celebrated Scientists and Their Embeddedness in Collaboration Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tian%2C+C">Chaolin Tian</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yurui Huang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Ching Jin</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yifang Ma</a>, <a href="/search/cs?searchtype=author&query=Uzzi%2C+B">Brian Uzzi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12180v1-abstract-short" style="display: inline;"> Matthew effects, or the tendency for early achievements in science to lead to more recognition and opportunities, are a potential source of stratification and lost innovation when they draw unreasonable attention away from equally innovative but less celebrated scholars. Here, we analyze whether prizewinners produce more innovative works before and after being awarded a prize compared to equivalen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12180v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12180v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12180v1-abstract-full" style="display: none;"> Matthew effects, or the tendency for early achievements in science to lead to more recognition and opportunities, are a potential source of stratification and lost innovation when they draw unreasonable attention away from equally innovative but less celebrated scholars. Here, we analyze whether prizewinners produce more innovative works before and after being awarded a prize compared to equivalently impactful non-prizewinning contenders. Our data covers the careers of prizewinners and their dynamically matched non-prizewinners, a longitudinal, science-wide sample of 23,562 scholars and 5.7 million publications. We measured the innovativeness of prizewinners' and non-prizewinners' publications in terms of their novelty, convergent thinking, and interdisciplinarity. We find that prizewinners display distinctive forms of innovativeness relative to their non-prizewinning counterparts in terms of combining ideas in novel ways, bridging foundational and cutting-edge work on a topic, and formulating approaches to problems that leverage the strengths of interdisciplinarity. Further, prizewinners' innovativeness is strongly predicted by their type of network embeddedness. In contrast to matched non-prizewinners, prizewinners have shorter-term collaborations, their collaborators tend to focus their attention on topics that are new to the prizewinners, and their collaborators' collaborators have minimal overlap. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12180v1-abstract-full').style.display = 'none'; document.getElementById('2411.12180v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07660">arXiv:2411.07660</a> <span> [<a href="https://arxiv.org/pdf/2411.07660">pdf</a>, <a href="https://arxiv.org/format/2411.07660">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HMIL: Hierarchical Multi-Instance Learning for Fine-Grained Whole Slide Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+L">Luyang Luo</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Huangjing Lin</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Jun Hou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07660v2-abstract-short" style="display: inline;"> Fine-grained classification of whole slide images (WSIs) is essential in precision oncology, enabling precise cancer diagnosis and personalized treatment strategies. The core of this task involves distinguishing subtle morphological variations within the same broad category of gigapixel-resolution images, which presents a significant challenge. While the multi-instance learning (MIL) paradigm alle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07660v2-abstract-full').style.display = 'inline'; document.getElementById('2411.07660v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07660v2-abstract-full" style="display: none;"> Fine-grained classification of whole slide images (WSIs) is essential in precision oncology, enabling precise cancer diagnosis and personalized treatment strategies. The core of this task involves distinguishing subtle morphological variations within the same broad category of gigapixel-resolution images, which presents a significant challenge. While the multi-instance learning (MIL) paradigm alleviates the computational burden of WSIs, existing MIL methods often overlook hierarchical label correlations, treating fine-grained classification as a flat multi-class classification task. To overcome these limitations, we introduce a novel hierarchical multi-instance learning (HMIL) framework. By facilitating on the hierarchical alignment of inherent relationships between different hierarchy of labels at instance and bag level, our approach provides a more structured and informative learning process. Specifically, HMIL incorporates a class-wise attention mechanism that aligns hierarchical information at both the instance and bag levels. Furthermore, we introduce supervised contrastive learning to enhance the discriminative capability for fine-grained classification and a curriculum-based dynamic weighting module to adaptively balance the hierarchical feature during training. Extensive experiments on our large-scale cytology cervical cancer (CCC) dataset and two public histology datasets, BRACS and PANDA, demonstrate the state-of-the-art class-wise and overall performance of our HMIL framework. Our source code is available at https://github.com/ChengJin-git/HMIL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07660v2-abstract-full').style.display = 'none'; document.getElementById('2411.07660v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00769">arXiv:2411.00769</a> <span> [<a href="https://arxiv.org/pdf/2411.00769">pdf</a>, <a href="https://arxiv.org/format/2411.00769">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GameGen-X: Interactive Open-world Game Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Che%2C+H">Haoxuan Che</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xuanhua He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Quande Liu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00769v3-abstract-short" style="display: inline;"> We introduce GameGen-X, the first diffusion transformer model specifically designed for both generating and interactively controlling open-world game videos. This model facilitates high-quality, open-domain generation by simulating an extensive array of game engine features, such as innovative characters, dynamic environments, complex actions, and diverse events. Additionally, it provides interact… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00769v3-abstract-full').style.display = 'inline'; document.getElementById('2411.00769v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00769v3-abstract-full" style="display: none;"> We introduce GameGen-X, the first diffusion transformer model specifically designed for both generating and interactively controlling open-world game videos. This model facilitates high-quality, open-domain generation by simulating an extensive array of game engine features, such as innovative characters, dynamic environments, complex actions, and diverse events. Additionally, it provides interactive controllability, predicting and altering future content based on the current clip, thus allowing for gameplay simulation. To realize this vision, we first collected and built an Open-World Video Game Dataset from scratch. It is the first and largest dataset for open-world game video generation and control, which comprises over a million diverse gameplay video clips sampling from over 150 games with informative captions from GPT-4o. GameGen-X undergoes a two-stage training process, consisting of foundation model pre-training and instruction tuning. Firstly, the model was pre-trained via text-to-video generation and video continuation, endowing it with the capability for long-sequence, high-quality open-domain game video generation. Further, to achieve interactive controllability, we designed InstructNet to incorporate game-related multi-modal control signal experts. This allows the model to adjust latent representations based on user inputs, unifying character interaction and scene content control for the first time in video generation. During instruction tuning, only the InstructNet is updated while the pre-trained foundation model is frozen, enabling the integration of interactive controllability without loss of diversity and quality of generated video content. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00769v3-abstract-full').style.display = 'none'; document.getElementById('2411.00769v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Homepage: https://gamegen-x.github.io/ Github: https://github.com/GameGen-X/GameGen-X</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22267">arXiv:2410.22267</a> <span> [<a href="https://arxiv.org/pdf/2410.22267">pdf</a>, <a href="https://arxiv.org/ps/2410.22267">ps</a>, <a href="https://arxiv.org/format/2410.22267">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Approximately Counting Knapsack Solutions in Subquadratic Time </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+W">Weiming Feng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Ce Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22267v1-abstract-short" style="display: inline;"> We revisit the classic #Knapsack problem, which asks to count the Boolean points $(x_1,\dots,x_n)\in\{0,1\}^n$ in a given half-space $\sum_{i=1}^nW_ix_i\le T$. This #P-complete problem admits $(1\pm蔚)$-approximation. Before this work, [Dyer, STOC 2003]'s $\tilde{O}(n^{2.5}+n^2{蔚^{-2}})$-time randomized approximation scheme remains the fastest known in the natural regime of $蔚\ge 1/polylog(n)$. In… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22267v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22267v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22267v1-abstract-full" style="display: none;"> We revisit the classic #Knapsack problem, which asks to count the Boolean points $(x_1,\dots,x_n)\in\{0,1\}^n$ in a given half-space $\sum_{i=1}^nW_ix_i\le T$. This #P-complete problem admits $(1\pm蔚)$-approximation. Before this work, [Dyer, STOC 2003]'s $\tilde{O}(n^{2.5}+n^2{蔚^{-2}})$-time randomized approximation scheme remains the fastest known in the natural regime of $蔚\ge 1/polylog(n)$. In this paper, we give a randomized $(1\pm蔚)$-approximation algorithm in $\tilde{O}(n^{1.5}{蔚^{-2}})$ time (in the standard word-RAM model), achieving the first sub-quadratic dependence on $n$. Such sub-quadratic running time is rare in the approximate counting literature in general, as a large class of algorithms naturally faces a quadratic-time barrier. Our algorithm follows Dyer's framework, which reduces #Knapsack to the task of sampling (and approximately counting) solutions in a randomly rounded instance with poly(n)-bounded integer weights. We refine Dyer's framework using the following ideas: - We decrease the sample complexity of Dyer's Monte Carlo method, by proving some structural lemmas for typical points near the input hyperplane via hitting-set arguments, and appropriately setting the rounding scale. - Instead of running a vanilla dynamic program on the rounded instance, we employ techniques from the growing field of pseudopolynomial-time Subset Sum algorithms, such as FFT, divide-and-conquer, and balls-into-bins hashing of [Bringmann, SODA 2017]. We also need other ingredients, including a surprising application of the recent Bounded Monotone (max,+)-Convolution algorithm by [Chi-Duan-Xie-Zhang, STOC 2022] (adapted by [Bringmann-D眉rr-Polak, ESA 2024]), the notion of sum-approximation from [Gawrychowski-Markin-Weimann, ICALP 2018]'s #Knapsack approximation scheme, and a two-phase extension of Dyer's framework for handling tiny weights. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22267v1-abstract-full').style.display = 'none'; document.getElementById('2410.22267v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at SODA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20764">arXiv:2410.20764</a> <span> [<a href="https://arxiv.org/pdf/2410.20764">pdf</a>, <a href="https://arxiv.org/ps/2410.20764">ps</a>, <a href="https://arxiv.org/format/2410.20764">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> New Applications of 3SUM-Counting in Fine-Grained Complexity and Pattern Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fischer%2C+N">Nick Fischer</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Ce Jin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yinzhan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20764v1-abstract-short" style="display: inline;"> The 3SUM problem is one of the cornerstones of fine-grained complexity. Its study has led to countless lower bounds, but as has been sporadically observed before -- and as we will demonstrate again -- insights on 3SUM can also lead to algorithmic applications. The starting point of our work is that we spend a lot of technical effort to develop new algorithms for 3SUM-type problems such as approx… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20764v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20764v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20764v1-abstract-full" style="display: none;"> The 3SUM problem is one of the cornerstones of fine-grained complexity. Its study has led to countless lower bounds, but as has been sporadically observed before -- and as we will demonstrate again -- insights on 3SUM can also lead to algorithmic applications. The starting point of our work is that we spend a lot of technical effort to develop new algorithms for 3SUM-type problems such as approximate 3SUM-counting, small-doubling 3SUM-counting, and a deterministic subquadratic-time algorithm for the celebrated Balog-Szemer茅di-Gowers theorem from additive combinatorics. As consequences of these tools, we derive diverse new results in fine-grained complexity and pattern matching algorithms, answering open questions from many unrelated research areas. Specifically: - A recent line of research on the "short cycle removal" technique culminated in tight 3SUM-based lower bounds for various graph problems via randomized fine-grained reductions [Abboud, Bringmann, Fischer; STOC '23] [Jin, Xu; STOC '23]. In this paper we derandomize the reduction to the important 4-Cycle Listing problem. - We establish that \#3SUM and 3SUM are fine-grained equivalent under deterministic reductions. - We give a deterministic algorithm for the $(1+蔚)$-approximate Text-to-Pattern Hamming Distances problem in time $n^{1+o(1)} \cdot 蔚^{-1}$. - In the $k$-Mismatch Constellation problem the input consists of two integer sets $A, B \subseteq [N]$, and the goal is to test whether there is a shift $c$ such that $|(c + B) \setminus A| \leq k$ (i.e., whether $B$ shifted by $c$ matches $A$ up to $k$ mismatches). For moderately small $k$ the previously best running time was $\tilde O(|A| \cdot k)$ [Cardoze, Schulman; FOCS '98] [Fischer; SODA '24]. We give a faster $|A| \cdot k^{2/3} \cdot N^{o(1)}$-time algorithm in the regime where $|B| = 螛(|A|)$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20764v1-abstract-full').style.display = 'none'; document.getElementById('2410.20764v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in SODA 2025. Abstract shortened to fit arXiv requirements</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18860">arXiv:2410.18860</a> <span> [<a href="https://arxiv.org/pdf/2410.18860">pdf</a>, <a href="https://arxiv.org/format/2410.18860">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DeCoRe: Decoding by Contrasting Retrieval Heads to Mitigate Hallucinations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gema%2C+A+P">Aryo Pradipta Gema</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chen Jin</a>, <a href="/search/cs?searchtype=author&query=Abdulaal%2C+A">Ahmed Abdulaal</a>, <a href="/search/cs?searchtype=author&query=Diethe%2C+T">Tom Diethe</a>, <a href="/search/cs?searchtype=author&query=Teare%2C+P">Philip Teare</a>, <a href="/search/cs?searchtype=author&query=Alex%2C+B">Beatrice Alex</a>, <a href="/search/cs?searchtype=author&query=Minervini%2C+P">Pasquale Minervini</a>, <a href="/search/cs?searchtype=author&query=Saseendran%2C+A">Amrutha Saseendran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18860v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) often hallucinate, producing unfaithful or factually incorrect outputs by misrepresenting the provided context or incorrectly recalling internal knowledge. Recent studies have identified specific attention heads within the Transformer architecture, known as retrieval heads, responsible for extracting relevant contextual information. We hypothesise that masking these re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18860v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18860v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18860v1-abstract-full" style="display: none;"> Large Language Models (LLMs) often hallucinate, producing unfaithful or factually incorrect outputs by misrepresenting the provided context or incorrectly recalling internal knowledge. Recent studies have identified specific attention heads within the Transformer architecture, known as retrieval heads, responsible for extracting relevant contextual information. We hypothesise that masking these retrieval heads can induce hallucinations and that contrasting the outputs of the base LLM and the masked LLM can reduce hallucinations. To this end, we propose Decoding by Contrasting Retrieval Heads (DeCoRe), a novel training-free decoding strategy that amplifies information found in the context and model parameters. DeCoRe mitigates potentially hallucinated responses by dynamically contrasting the outputs of the base LLM and the masked LLM, using conditional entropy as a guide. Our extensive experiments confirm that DeCoRe significantly improves performance on tasks requiring high contextual faithfulness, such as summarisation (XSum by 18.6%), instruction following (MemoTrap by 10.9%), and open-book question answering (NQ-Open by 2.4% and NQ-Swap by 5.5%). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18860v1-abstract-full').style.display = 'none'; document.getElementById('2410.18860v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14795">arXiv:2410.14795</a> <span> [<a href="https://arxiv.org/pdf/2410.14795">pdf</a>, <a href="https://arxiv.org/format/2410.14795">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Cross-Document Event-Keyed Summarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Walden%2C+W">William Walden</a>, <a href="/search/cs?searchtype=author&query=Kuchmiichuk%2C+P">Pavlo Kuchmiichuk</a>, <a href="/search/cs?searchtype=author&query=Martin%2C+A">Alexander Martin</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chihsheng Jin</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+A">Angela Cao</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Claire Sun</a>, <a href="/search/cs?searchtype=author&query=Allen%2C+C">Curisia Allen</a>, <a href="/search/cs?searchtype=author&query=White%2C+A+S">Aaron Steven White</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14795v2-abstract-short" style="display: inline;"> Event-keyed summarization (EKS) requires summarizing a specific event described in a document given the document text and an event representation extracted from it. In this work, we extend EKS to the cross-document setting (CDEKS), in which summaries must synthesize information from accounts of the same event as given by multiple sources. We introduce SEAMUS (Summaries of Events Across Multiple So… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14795v2-abstract-full').style.display = 'inline'; document.getElementById('2410.14795v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14795v2-abstract-full" style="display: none;"> Event-keyed summarization (EKS) requires summarizing a specific event described in a document given the document text and an event representation extracted from it. In this work, we extend EKS to the cross-document setting (CDEKS), in which summaries must synthesize information from accounts of the same event as given by multiple sources. We introduce SEAMUS (Summaries of Events Across Multiple Sources), a high-quality dataset for CDEKS based on an expert reannotation of the FAMUS dataset for cross-document argument extraction. We present a suite of baselines on SEAMUS -- covering both smaller, fine-tuned models, as well as zero- and few-shot prompted LLMs -- along with detailed ablations and a human evaluation study, showing SEAMUS to be a valuable benchmark for this new task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14795v2-abstract-full').style.display = 'none'; document.getElementById('2410.14795v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL Rolling Review long paper (in submission)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13523">arXiv:2410.13523</a> <span> [<a href="https://arxiv.org/pdf/2410.13523">pdf</a>, <a href="https://arxiv.org/format/2410.13523">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Can Medical Vision-Language Pre-training Succeed with Purely Synthetic Data? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+C">Che Liu</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haozhe Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yinda Chen</a>, <a href="/search/cs?searchtype=author&query=Qaiser%2C+T">Talha Qaiser</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chen Jin</a>, <a href="/search/cs?searchtype=author&query=Yousefi%2C+F">Fariba Yousefi</a>, <a href="/search/cs?searchtype=author&query=Burlutskiy%2C+N">Nikolay Burlutskiy</a>, <a href="/search/cs?searchtype=author&query=Arcucci%2C+R">Rossella Arcucci</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13523v1-abstract-short" style="display: inline;"> Medical Vision-Language Pre-training (MedVLP) has made significant progress in enabling zero-shot tasks for medical image understanding. However, training MedVLP models typically requires large-scale datasets with paired, high-quality image-text data, which are scarce in the medical domain. Recent advancements in Large Language Models (LLMs) and diffusion models have made it possible to generate l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13523v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13523v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13523v1-abstract-full" style="display: none;"> Medical Vision-Language Pre-training (MedVLP) has made significant progress in enabling zero-shot tasks for medical image understanding. However, training MedVLP models typically requires large-scale datasets with paired, high-quality image-text data, which are scarce in the medical domain. Recent advancements in Large Language Models (LLMs) and diffusion models have made it possible to generate large-scale synthetic image-text pairs. This raises the question: "Can MedVLP succeed using purely synthetic data?" To address this, we use off-the-shelf generative models to create synthetic radiology reports and paired Chest X-ray (CXR) images, and propose an automated pipeline to build a diverse, high-quality synthetic dataset, enabling a rigorous study that isolates model and training settings, focusing entirely from the data perspective. Our results show that MedVLP models trained exclusively on synthetic data outperform those trained on real data by 3.8% in averaged AUC on zero-shot classification. Moreover, using a combination of synthetic and real data leads to a further improvement of 9.07%. Additionally, MedVLP models trained on synthetic or mixed data consistently outperform those trained on real data in zero-shot grounding, as well as in fine-tuned classification and segmentation tasks. Our analysis suggests MedVLP trained on well-designed synthetic data can outperform models trained on real datasets, which may be limited by low-quality samples and long-tailed distributions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13523v1-abstract-full').style.display = 'none'; document.getElementById('2410.13523v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03226">arXiv:2410.03226</a> <span> [<a href="https://arxiv.org/pdf/2410.03226">pdf</a>, <a href="https://arxiv.org/format/2410.03226">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Frame-Voyager: Learning to Query Frames for Video Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+S">Sicheng Yu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chengkai Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huanyu Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhenghao Chen</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+S">Sheng Jin</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+Z">Zhongrong Zuo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaolei Xu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhenbang Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bingni Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiawei Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Q">Qianru Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03226v2-abstract-short" style="display: inline;"> Video Large Language Models (Video-LLMs) have made remarkable progress in video understanding tasks. However, they are constrained by the maximum length of input tokens, making it impractical to input entire videos. Existing frame selection approaches, such as uniform frame sampling and text-frame retrieval, fail to account for the information density variations in the videos or the complex instru… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03226v2-abstract-full').style.display = 'inline'; document.getElementById('2410.03226v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03226v2-abstract-full" style="display: none;"> Video Large Language Models (Video-LLMs) have made remarkable progress in video understanding tasks. However, they are constrained by the maximum length of input tokens, making it impractical to input entire videos. Existing frame selection approaches, such as uniform frame sampling and text-frame retrieval, fail to account for the information density variations in the videos or the complex instructions in the tasks, leading to sub-optimal performance. In this paper, we propose Frame-Voyager that learns to query informative frame combinations, based on the given textual queries in the task. To train Frame-Voyager, we introduce a new data collection and labeling pipeline, by ranking frame combinations using a pre-trained Video-LLM. Given a video of M frames, we traverse its T-frame combinations, feed them into a Video-LLM, and rank them based on Video-LLM's prediction losses. Using this ranking as supervision, we train Frame-Voyager to query the frame combinations with lower losses. In experiments, we evaluate Frame-Voyager on four Video Question Answering benchmarks by plugging it into two different Video-LLMs. The experimental results demonstrate that Frame-Voyager achieves impressive results in all settings, highlighting its potential as a plug-and-play solution for Video-LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03226v2-abstract-full').style.display = 'none'; document.getElementById('2410.03226v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08020">arXiv:2409.08020</a> <span> [<a href="https://arxiv.org/pdf/2409.08020">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Network Anomaly Traffic Detection via Multi-view Feature Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hao%2C+S">Song Hao</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+W">Wentao Fu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuanze Chen</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chengxiang Jin</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiajun Zhou</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shanqing Yu</a>, <a href="/search/cs?searchtype=author&query=Xuan%2C+Q">Qi Xuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08020v1-abstract-short" style="display: inline;"> Traditional anomalous traffic detection methods are based on single-view analysis, which has obvious limitations in dealing with complex attacks and encrypted communications. In this regard, we propose a Multi-view Feature Fusion (MuFF) method for network anomaly traffic detection. MuFF models the temporal and interactive relationships of packets in network traffic based on the temporal and intera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08020v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08020v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08020v1-abstract-full" style="display: none;"> Traditional anomalous traffic detection methods are based on single-view analysis, which has obvious limitations in dealing with complex attacks and encrypted communications. In this regard, we propose a Multi-view Feature Fusion (MuFF) method for network anomaly traffic detection. MuFF models the temporal and interactive relationships of packets in network traffic based on the temporal and interactive viewpoints respectively. It learns temporal and interactive features. These features are then fused from different perspectives for anomaly traffic detection. Extensive experiments on six real traffic datasets show that MuFF has excellent performance in network anomalous traffic detection, which makes up for the shortcomings of detection under a single perspective. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08020v1-abstract-full').style.display = 'none'; document.getElementById('2409.08020v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">in Chinese language, Accepted by Journal of Command and Control</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02392">arXiv:2409.02392</a> <span> [<a href="https://arxiv.org/pdf/2409.02392">pdf</a>, <a href="https://arxiv.org/format/2409.02392">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Building Math Agents with Multi-Turn Iterative Preference Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wei Xiong</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+C">Chengshuai Shi</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jiaming Shen</a>, <a href="/search/cs?searchtype=author&query=Rosenberg%2C+A">Aviv Rosenberg</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Zhen Qin</a>, <a href="/search/cs?searchtype=author&query=Calandriello%2C+D">Daniele Calandriello</a>, <a href="/search/cs?searchtype=author&query=Khalman%2C+M">Misha Khalman</a>, <a href="/search/cs?searchtype=author&query=Joshi%2C+R">Rishabh Joshi</a>, <a href="/search/cs?searchtype=author&query=Piot%2C+B">Bilal Piot</a>, <a href="/search/cs?searchtype=author&query=Saleh%2C+M">Mohammad Saleh</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chi Jin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tong Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianqi Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02392v1-abstract-short" style="display: inline;"> Recent studies have shown that large language models' (LLMs) mathematical problem-solving capabilities can be enhanced by integrating external tools, such as code interpreters, and employing multi-turn Chain-of-Thought (CoT) reasoning. While current methods focus on synthetic data generation and Supervised Fine-Tuning (SFT), this paper studies the complementary direct preference learning approach… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02392v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02392v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02392v1-abstract-full" style="display: none;"> Recent studies have shown that large language models' (LLMs) mathematical problem-solving capabilities can be enhanced by integrating external tools, such as code interpreters, and employing multi-turn Chain-of-Thought (CoT) reasoning. While current methods focus on synthetic data generation and Supervised Fine-Tuning (SFT), this paper studies the complementary direct preference learning approach to further improve model performance. However, existing direct preference learning algorithms are originally designed for the single-turn chat task, and do not fully address the complexities of multi-turn reasoning and external tool integration required for tool-integrated mathematical reasoning tasks. To fill in this gap, we introduce a multi-turn direct preference learning framework, tailored for this context, that leverages feedback from code interpreters and optimizes trajectory-level preferences. This framework includes multi-turn DPO and multi-turn KTO as specific implementations. The effectiveness of our framework is validated through training of various language models using an augmented prompt set from the GSM8K and MATH datasets. Our results demonstrate substantial improvements: a supervised fine-tuned Gemma-1.1-it-7B model's performance increased from 77.5% to 83.9% on GSM8K and from 46.1% to 51.2% on MATH. Similarly, a Gemma-2-it-9B model improved from 84.1% to 86.3% on GSM8K and from 51.0% to 54.5% on MATH. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02392v1-abstract-full').style.display = 'none'; document.getElementById('2409.02392v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A multi-turn direct preference learning framework for tool-integrated reasoning tasks</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12574">arXiv:2408.12574</a> <span> [<a href="https://arxiv.org/pdf/2408.12574">pdf</a>, <a href="https://arxiv.org/format/2408.12574">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MuMA-ToM: Multi-modal Multi-Agent Theory of Mind </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+H">Haojun Shi</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+S">Suyu Ye</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+X">Xinyu Fang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chuanyang Jin</a>, <a href="/search/cs?searchtype=author&query=Isik%2C+L">Leyla Isik</a>, <a href="/search/cs?searchtype=author&query=Kuo%2C+Y">Yen-Ling Kuo</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+T">Tianmin Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12574v4-abstract-short" style="display: inline;"> Understanding people's social interactions in complex real-world scenarios often relies on intricate mental reasoning. To truly understand how and why people interact with one another, we must infer the underlying mental states that give rise to the social interactions, i.e., Theory of Mind reasoning in multi-agent interactions. Additionally, social interactions are often multi-modal -- we can wat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12574v4-abstract-full').style.display = 'inline'; document.getElementById('2408.12574v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12574v4-abstract-full" style="display: none;"> Understanding people's social interactions in complex real-world scenarios often relies on intricate mental reasoning. To truly understand how and why people interact with one another, we must infer the underlying mental states that give rise to the social interactions, i.e., Theory of Mind reasoning in multi-agent interactions. Additionally, social interactions are often multi-modal -- we can watch people's actions, hear their conversations, and/or read about their past behaviors. For AI systems to successfully and safely interact with people in real-world environments, they also need to understand people's mental states as well as their inferences about each other's mental states based on multi-modal information about their interactions. For this, we introduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark. MuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates mental reasoning in embodied multi-agent interactions. In MuMA-ToM, we provide video and text descriptions of people's multi-modal behavior in realistic household environments. Based on the context, we then ask questions about people's goals, beliefs, and beliefs about others' goals. We validated MuMA-ToM in a human experiment and provided a human baseline. We also proposed a novel multi-modal, multi-agent ToM model, LIMP (Language model-based Inverse Multi-agent Planning). Our experimental results show that LIMP significantly outperforms state-of-the-art methods, including large multi-modal models (e.g., GPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12574v4-abstract-full').style.display = 'none'; document.getElementById('2408.12574v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI-25 (Oral). Project website: https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code: https://github.com/SCAI-JHU/MuMA-ToM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11974">arXiv:2408.11974</a> <span> [<a href="https://arxiv.org/pdf/2408.11974">pdf</a>, <a href="https://arxiv.org/format/2408.11974">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Two-Timescale Gradient Descent Ascent Algorithms for Nonconvex Minimax Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+T">Tianyi Lin</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chi Jin</a>, <a href="/search/cs?searchtype=author&query=Jordan%2C+M+I">Michael. I. Jordan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11974v3-abstract-short" style="display: inline;"> We provide a unified analysis of two-timescale gradient descent ascent (TTGDA) for solving structured nonconvex minimax optimization problems in the form of $\min_\textbf{x} \max_{\textbf{y} \in Y} f(\textbf{x}, \textbf{y})$, where the objective function $f(\textbf{x}, \textbf{y})$ is nonconvex in $\textbf{x}$ and concave in $\textbf{y}$, and the constraint set $Y \subseteq \mathbb{R}^n$ is convex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11974v3-abstract-full').style.display = 'inline'; document.getElementById('2408.11974v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11974v3-abstract-full" style="display: none;"> We provide a unified analysis of two-timescale gradient descent ascent (TTGDA) for solving structured nonconvex minimax optimization problems in the form of $\min_\textbf{x} \max_{\textbf{y} \in Y} f(\textbf{x}, \textbf{y})$, where the objective function $f(\textbf{x}, \textbf{y})$ is nonconvex in $\textbf{x}$ and concave in $\textbf{y}$, and the constraint set $Y \subseteq \mathbb{R}^n$ is convex and bounded. In the convex-concave setting, the single-timescale gradient descent ascent (GDA) algorithm is widely used in applications and has been shown to have strong convergence guarantees. In more general settings, however, it can fail to converge. Our contribution is to design TTGDA algorithms that are effective beyond the convex-concave setting, efficiently finding a stationary point of the function $桅(\cdot) := \max_{\textbf{y} \in Y} f(\cdot, \textbf{y})$. We also establish theoretical bounds on the complexity of solving both smooth and nonsmooth nonconvex-concave minimax optimization problems. To the best of our knowledge, this is the first systematic analysis of TTGDA for nonconvex minimax optimization, shedding light on its superior performance in training generative adversarial networks (GANs) and in other real-world application problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11974v3-abstract-full').style.display = 'none'; document.getElementById('2408.11974v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Journal of Machine Learning Research; A preliminary version [arXiv:1906.00331] of this paper, with a subset of the results that are presented here, was presented at ICML 2020; 44 Pages, 10 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07433">arXiv:2408.07433</a> <span> [<a href="https://arxiv.org/pdf/2408.07433">pdf</a>, <a href="https://arxiv.org/format/2408.07433">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MagicFace: Training-free Universal-Style Human Image Customized Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yibin Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weizhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07433v5-abstract-short" style="display: inline;"> Current human image customization methods leverage Stable Diffusion (SD) for its rich semantic prior. However, since SD is not specifically designed for human-oriented generation, these methods often require extensive fine-tuning on large-scale datasets, which renders them susceptible to overfitting and hinders their ability to personalize individuals with previously unseen styles. Moreover, these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07433v5-abstract-full').style.display = 'inline'; document.getElementById('2408.07433v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07433v5-abstract-full" style="display: none;"> Current human image customization methods leverage Stable Diffusion (SD) for its rich semantic prior. However, since SD is not specifically designed for human-oriented generation, these methods often require extensive fine-tuning on large-scale datasets, which renders them susceptible to overfitting and hinders their ability to personalize individuals with previously unseen styles. Moreover, these methods extensively focus on single-concept human image synthesis and lack the flexibility to customize individuals using multiple given concepts, thereby impeding their broader practical application. This paper proposes MagicFace, a novel training-free method for multi-concept universal-style human image personalized synthesis. Our core idea is to simulate how humans create images given specific concepts, i.e., first establish a semantic layout considering factors such as concepts' shape and posture, then optimize details by comparing with concepts at the pixel level. To implement this process, we introduce a coarse-to-fine generation pipeline, involving two sequential stages: semantic layout construction and concept feature injection. This is achieved by our Reference-aware Self-Attention (RSA) and Region-grouped Blend Attention (RBA) mechanisms. In the first stage, RSA enables the latent image to query features from all reference concepts simultaneously, extracting the overall semantic understanding to facilitate the initial semantic layout establishment. In the second stage, we employ an attention-based semantic segmentation method to pinpoint the latent generated regions of all concepts at each step. Following this, RBA divides the pixels of the latent image into semantic groups, with each group querying fine-grained features from the corresponding reference concept. Extensive experiments demonstrate the superiority of our MagicFace. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07433v5-abstract-full').style.display = 'none'; document.getElementById('2408.07433v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://codegoat24.github.io/MagicFace</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04172">arXiv:2408.04172</a> <span> [<a href="https://arxiv.org/pdf/2408.04172">pdf</a>, <a href="https://arxiv.org/format/2408.04172">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> MultiColor: Image Colorization by Learning from Multiple Color Spaces </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+X">Xiangcheng Du</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhao Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanlong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhuoyao Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yingbin Zheng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04172v1-abstract-short" style="display: inline;"> Deep networks have shown impressive performance in the image restoration tasks, such as image colorization. However, we find that previous approaches rely on the digital representation from single color model with a specific mapping function, a.k.a., color space, during the colorization pipeline. In this paper, we first investigate the modeling of different color spaces, and find each of them exhi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04172v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04172v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04172v1-abstract-full" style="display: none;"> Deep networks have shown impressive performance in the image restoration tasks, such as image colorization. However, we find that previous approaches rely on the digital representation from single color model with a specific mapping function, a.k.a., color space, during the colorization pipeline. In this paper, we first investigate the modeling of different color spaces, and find each of them exhibiting distinctive characteristics with unique distribution of colors. The complementarity among multiple color spaces leads to benefits for the image colorization task. We present MultiColor, a new learning-based approach to automatically colorize grayscale images that combines clues from multiple color spaces. Specifically, we employ a set of dedicated colorization modules for individual color space. Within each module, a transformer decoder is first employed to refine color query embeddings and then a color mapper produces color channel prediction using the embeddings and semantic features. With these predicted color channels representing various color spaces, a complementary network is designed to exploit the complementarity and generate pleasing and reasonable colorized images. We conduct extensive experiments on real-world datasets, and the results demonstrate superior performance over the state-of-the-arts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04172v1-abstract-full').style.display = 'none'; document.getElementById('2408.04172v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03508">arXiv:2408.03508</a> <span> [<a href="https://arxiv.org/pdf/2408.03508">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> SemiEpi: Self-driving, Closed-loop Multi-Step Growth of Semiconductor Heterostructures Guided by Machine Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chao Shen</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+W">Wenkang Zhan</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+K">Kaiyao Xin</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+S">Shujie Pan</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xiaotian Cheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Ruixiang Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhe Feng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chaoyuan Jin</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+H">Hui Cong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chi Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+B">Bo Xu</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+T+K">Tien Khee Ng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siming Chen</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+C">Chunlai Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhanguo Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chao Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03508v3-abstract-short" style="display: inline;"> The semiconductor industry has prioritized automating repetitive tasks through closed-loop, self-driving experimentation, accelerating the optimization of complex multi-step processes. The emergence of machine learning (ML) has ushered in self-driving processes with minimal human intervention. This work introduces SemiEpi, a self-driving platform designed to execute molecular beam epitaxy (MBE) gr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03508v3-abstract-full').style.display = 'inline'; document.getElementById('2408.03508v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03508v3-abstract-full" style="display: none;"> The semiconductor industry has prioritized automating repetitive tasks through closed-loop, self-driving experimentation, accelerating the optimization of complex multi-step processes. The emergence of machine learning (ML) has ushered in self-driving processes with minimal human intervention. This work introduces SemiEpi, a self-driving platform designed to execute molecular beam epitaxy (MBE) growth of semiconductor heterostructures through multi-step processes, in-situ monitoring, and on-the-fly feedback control. By integrating standard reactor, parameter initialization, and multiple ML models, SemiEpi identifies optimal initial conditions and proposes experiments for multi-step heterostructure growth, eliminating the need for extensive expertise in MBE processes. SemiEpi initializes material growth parameters tailored to specific material characteristics, and fine-tuned control over the growth process is then achieved through ML optimization. We optimize the growth for InAs quantum dots (QDs) heterostructures to showcase the power of SemiEpi, achieving a QD density of 5E10/cm2, 1.6-fold increased photoluminescence (PL) intensity and reduced full width at half maximum (FWHM) of 29.13 meV. This work highlights the potential of closed-loop, ML-guided systems to address challenges in multi-step growth. Our method is critical to achieve repeatable materials growth using commercially scalable tools. Furthermore, our strategy facilitates developing a hardware-independent process and enhancing process repeatability and stability, even without exhaustive knowledge of growth parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03508v3-abstract-full').style.display = 'none'; document.getElementById('2408.03508v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00641">arXiv:2408.00641</a> <span> [<a href="https://arxiv.org/pdf/2408.00641">pdf</a>, <a href="https://arxiv.org/format/2408.00641">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TIFS.2024.3521611">10.1109/TIFS.2024.3521611 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Enhancing Ethereum Fraud Detection via Generative and Contrastive Self-supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chenxiang Jin</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiajun Zhou</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+C">Chenxuan Xie</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shanqing Yu</a>, <a href="/search/cs?searchtype=author&query=Xuan%2C+Q">Qi Xuan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoniu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00641v3-abstract-short" style="display: inline;"> The rampant fraudulent activities on Ethereum hinder the healthy development of the blockchain ecosystem, necessitating the reinforcement of regulations. However, multiple imbalances involving account interaction frequencies and interaction types in the Ethereum transaction environment pose significant challenges to data mining-based fraud detection research. To address this, we first propose the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00641v3-abstract-full').style.display = 'inline'; document.getElementById('2408.00641v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00641v3-abstract-full" style="display: none;"> The rampant fraudulent activities on Ethereum hinder the healthy development of the blockchain ecosystem, necessitating the reinforcement of regulations. However, multiple imbalances involving account interaction frequencies and interaction types in the Ethereum transaction environment pose significant challenges to data mining-based fraud detection research. To address this, we first propose the concept of meta-interactions to refine interaction behaviors in Ethereum, and based on this, we present a dual self-supervision enhanced Ethereum fraud detection framework, named Meta-IFD. This framework initially introduces a generative self-supervision mechanism to augment the interaction features of accounts, followed by a contrastive self-supervision mechanism to differentiate various behavior patterns, and ultimately characterizes the behavioral representations of accounts and mines potential fraud risks through multi-view interaction feature learning. Extensive experiments on real Ethereum datasets demonstrate the effectiveness and superiority of our framework in detecting common Ethereum fraud behaviors such as Ponzi schemes and phishing scams. Additionally, the generative module can effectively alleviate the interaction distribution imbalance in Ethereum data, while the contrastive module significantly enhances the framework's ability to distinguish different behavior patterns. The source code will be available in https://github.com/GISec-Team/Meta-IFD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00641v3-abstract-full').style.display = 'none'; document.getElementById('2408.00641v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Information Forensics & Security</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.18449">arXiv:2407.18449</a> <span> [<a href="https://arxiv.org/pdf/2407.18449">pdf</a>, <a href="https://arxiv.org/format/2407.18449">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards A Generalizable Pathology Foundation Model via Unified Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jiabo Ma</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhengrui Guo</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Fengtao Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yihui Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yingxue Xu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yu Cai</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zhengjie Zhu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yi Lin</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xinrui Jiang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+A">Anjia Han</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+L">Li Liang</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+R+C+K">Ronald Cheong Kin Chan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiguang Wang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+K">Kwang-Ting Cheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.18449v2-abstract-short" style="display: inline;"> Foundation models pretrained on large-scale datasets are revolutionizing the field of computational pathology (CPath). The generalization ability of foundation models is crucial for the success in various downstream clinical tasks. However, current foundation models have only been evaluated on a limited type and number of tasks, leaving their generalization ability and overall performance unclear.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18449v2-abstract-full').style.display = 'inline'; document.getElementById('2407.18449v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.18449v2-abstract-full" style="display: none;"> Foundation models pretrained on large-scale datasets are revolutionizing the field of computational pathology (CPath). The generalization ability of foundation models is crucial for the success in various downstream clinical tasks. However, current foundation models have only been evaluated on a limited type and number of tasks, leaving their generalization ability and overall performance unclear. To address this gap, we established a most comprehensive benchmark to evaluate the performance of off-the-shelf foundation models across six distinct clinical task types, encompassing a total of 39 specific tasks. Our findings reveal that existing foundation models excel at certain task types but struggle to effectively handle the full breadth of clinical tasks. To improve the generalization of pathology foundation models, we propose a unified knowledge distillation framework consisting of both expert and self knowledge distillation, where the former allows the model to learn from the knowledge of multiple expert models, while the latter leverages self-distillation to enable image representation learning via local-global alignment. Based on this framework, a Generalizable Pathology Foundation Model (GPFM) is pretrained on a large-scale dataset consisting of 190 million images from around 86,000 public H&E whole slides across 34 major tissue types. Evaluated on the established benchmark, GPFM achieves an impressive average rank of 1.36, with 29 tasks ranked 1st, while the the second-best model, UNI, attains an average rank of 2.96, with only 4 tasks ranked 1st. The superior generalization of GPFM demonstrates its exceptional modeling capabilities across a wide range of clinical tasks, positioning it as a new cornerstone for feature representation in CPath. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18449v2-abstract-full').style.display = 'none'; document.getElementById('2407.18449v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> I.2.10 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.15556">arXiv:2407.15556</a> <span> [<a href="https://arxiv.org/pdf/2407.15556">pdf</a>, <a href="https://arxiv.org/format/2407.15556">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SETTP: Style Extraction and Tunable Inference via Dual-level Transferable Prompt Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chunzhen Jin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yongfeng Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaqi Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+P">Peng Cao</a>, <a href="/search/cs?searchtype=author&query=Zaiane%2C+O">Osmar Zaiane</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.15556v1-abstract-short" style="display: inline;"> Text style transfer, an important research direction in natural language processing, aims to adapt the text to various preferences but often faces challenges with limited resources. In this work, we introduce a novel method termed Style Extraction and Tunable Inference via Dual-level Transferable Prompt Learning (SETTP) for effective style transfer in low-resource scenarios. First, SETTP learns so… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15556v1-abstract-full').style.display = 'inline'; document.getElementById('2407.15556v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.15556v1-abstract-full" style="display: none;"> Text style transfer, an important research direction in natural language processing, aims to adapt the text to various preferences but often faces challenges with limited resources. In this work, we introduce a novel method termed Style Extraction and Tunable Inference via Dual-level Transferable Prompt Learning (SETTP) for effective style transfer in low-resource scenarios. First, SETTP learns source style-level prompts containing fundamental style characteristics from high-resource style transfer. During training, the source style-level prompts are transferred through an attention module to derive a target style-level prompt for beneficial knowledge provision in low-resource style transfer. Additionally, we propose instance-level prompts obtained by clustering the target resources based on the semantic content to reduce semantic bias. We also propose an automated evaluation approach of style similarity based on alignment with human evaluations using ChatGPT-4. Our experiments across three resourceful styles show that SETTP requires only 1/20th of the data volume to achieve performance comparable to state-of-the-art methods. In tasks involving scarce data like writing style and role style, SETTP outperforms previous methods by 16.24\%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15556v1-abstract-full').style.display = 'none'; document.getElementById('2407.15556v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13765">arXiv:2407.13765</a> <span> [<a href="https://arxiv.org/pdf/2407.13765">pdf</a>, <a href="https://arxiv.org/format/2407.13765">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Latent Causal Probing: A Formal Perspective on Probing with Causal Models of Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+C">Charles Jin</a>, <a href="/search/cs?searchtype=author&query=Rinard%2C+M">Martin Rinard</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13765v2-abstract-short" style="display: inline;"> As language models (LMs) deliver increasing performance on a range of NLP tasks, probing classifiers have become an indispensable technique in the effort to better understand their inner workings. A typical setup involves (1) defining an auxiliary task consisting of a dataset of text annotated with labels, then (2) supervising small classifiers to predict the labels from the representations of a p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13765v2-abstract-full').style.display = 'inline'; document.getElementById('2407.13765v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13765v2-abstract-full" style="display: none;"> As language models (LMs) deliver increasing performance on a range of NLP tasks, probing classifiers have become an indispensable technique in the effort to better understand their inner workings. A typical setup involves (1) defining an auxiliary task consisting of a dataset of text annotated with labels, then (2) supervising small classifiers to predict the labels from the representations of a pretrained LM as it processed the dataset. A high probing accuracy is interpreted as evidence that the LM has learned to perform the auxiliary task as an unsupervised byproduct of its original pretraining objective. Despite the widespread usage of probes, however, the robust design and analysis of probing experiments remains a challenge. We develop a formal perspective on probing using structural causal models (SCM). Specifically, given an SCM which explains the distribution of tokens observed during training, we frame the central hypothesis as whether the LM has learned to represent the latent variables of the SCM. Empirically, we extend a recent study of LMs in the context of a synthetic grid-world navigation task, where having an exact model of the underlying causal structure allows us to draw strong inferences from the result of probing experiments. Our techniques provide robust empirical evidence for the ability of LMs to induce the latent concepts underlying text. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13765v2-abstract-full').style.display = 'none'; document.getElementById('2407.13765v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">COLM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10890">arXiv:2407.10890</a> <span> [<a href="https://arxiv.org/pdf/2407.10890">pdf</a>, <a href="https://arxiv.org/format/2407.10890">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Thinking Fast and Slow: Data-Driven Adaptive DeFi Borrow-Lending Protocol </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bastankhah%2C+M">Mahsa Bastankhah</a>, <a href="/search/cs?searchtype=author&query=Nadkarni%2C+V">Viraj Nadkarni</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xuechao Wang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chi Jin</a>, <a href="/search/cs?searchtype=author&query=Kulkarni%2C+S">Sanjeev Kulkarni</a>, <a href="/search/cs?searchtype=author&query=Viswanath%2C+P">Pramod Viswanath</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.10890v1-abstract-short" style="display: inline;"> Decentralized finance (DeFi) borrowing and lending platforms are crucial to the decentralized economy, involving two main participants: lenders who provide assets for interest and borrowers who offer collateral exceeding their debt and pay interest. Collateral volatility necessitates over-collateralization to protect lenders and ensure competitive returns. Traditional DeFi platforms use a fixed in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10890v1-abstract-full').style.display = 'inline'; document.getElementById('2407.10890v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.10890v1-abstract-full" style="display: none;"> Decentralized finance (DeFi) borrowing and lending platforms are crucial to the decentralized economy, involving two main participants: lenders who provide assets for interest and borrowers who offer collateral exceeding their debt and pay interest. Collateral volatility necessitates over-collateralization to protect lenders and ensure competitive returns. Traditional DeFi platforms use a fixed interest rate curve based on the utilization rate (the fraction of available assets borrowed) and determine over-collateralization offline through simulations to manage risk. This method doesn't adapt well to dynamic market changes, such as price fluctuations and evolving user needs, often resulting in losses for lenders or borrowers. In this paper, we introduce an adaptive, data-driven protocol for DeFi borrowing and lending. Our approach includes a high-frequency controller that dynamically adjusts interest rates to maintain market stability and competitiveness with external markets. Unlike traditional protocols, which rely on user reactions and often adjust slowly, our controller uses a learning-based algorithm to quickly find optimal interest rates, reducing the opportunity cost for users during periods of misalignment with external rates. Additionally, we use a low-frequency planner that analyzes user behavior to set an optimal over-collateralization ratio, balancing risk reduction with profit maximization over the long term. This dual approach is essential for adaptive markets: the short-term component maintains market stability, preventing exploitation, while the long-term planner optimizes market parameters to enhance profitability and reduce risks. We provide theoretical guarantees on the convergence rates and adversarial robustness of the short-term component and the long-term effectiveness of our protocol. Empirical validation confirms our protocol's theoretical benefits. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10890v1-abstract-full').style.display = 'none'; document.getElementById('2407.10890v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05875">arXiv:2407.05875</a> <span> [<a href="https://arxiv.org/pdf/2407.05875">pdf</a>, <a href="https://arxiv.org/format/2407.05875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Minutes to Seconds: Speeded-up DDPM-based Image Inpainting with Coarse-to-Fine Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lintao Zhang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xiangcheng Du</a>, <a href="/search/cs?searchtype=author&query=TomyEnrique%2C+L">LeoWu TomyEnrique</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiqun Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yingbin Zheng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05875v1-abstract-short" style="display: inline;"> For image inpainting, the existing Denoising Diffusion Probabilistic Model (DDPM) based method i.e. RePaint can produce high-quality images for any inpainting form. It utilizes a pre-trained DDPM as a prior and generates inpainting results by conditioning on the reverse diffusion process, namely denoising process. However, this process is significantly time-consuming. In this paper, we propose an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05875v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05875v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05875v1-abstract-full" style="display: none;"> For image inpainting, the existing Denoising Diffusion Probabilistic Model (DDPM) based method i.e. RePaint can produce high-quality images for any inpainting form. It utilizes a pre-trained DDPM as a prior and generates inpainting results by conditioning on the reverse diffusion process, namely denoising process. However, this process is significantly time-consuming. In this paper, we propose an efficient DDPM-based image inpainting method which includes three speed-up strategies. First, we utilize a pre-trained Light-Weight Diffusion Model (LWDM) to reduce the number of parameters. Second, we introduce a skip-step sampling scheme of Denoising Diffusion Implicit Models (DDIM) for the denoising process. Finally, we propose Coarse-to-Fine Sampling (CFS), which speeds up inference by reducing image resolution in the coarse stage and decreasing denoising timesteps in the refinement stage. We conduct extensive experiments on both faces and general-purpose image inpainting tasks, and our method achieves competitive performance with approximately 60 times speedup. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05875v1-abstract-full').style.display = 'none'; document.getElementById('2407.05875v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The code is avaliable at: https://github.com/linghuyuhangyuan/M2S</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02769">arXiv:2407.02769</a> <span> [<a href="https://arxiv.org/pdf/2407.02769">pdf</a>, <a href="https://arxiv.org/format/2407.02769">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fine-Grained Scene Image Classification with Modality-Agnostic Adapter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiqun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhao Zhou</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xiangcheng Du</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xingjiao Wu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yingbin Zheng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02769v1-abstract-short" style="display: inline;"> When dealing with the task of fine-grained scene image classification, most previous works lay much emphasis on global visual features when doing multi-modal feature fusion. In other words, models are deliberately designed based on prior intuitions about the importance of different modalities. In this paper, we present a new multi-modal feature fusion approach named MAA (Modality-Agnostic Adapter)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02769v1-abstract-full').style.display = 'inline'; document.getElementById('2407.02769v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02769v1-abstract-full" style="display: none;"> When dealing with the task of fine-grained scene image classification, most previous works lay much emphasis on global visual features when doing multi-modal feature fusion. In other words, models are deliberately designed based on prior intuitions about the importance of different modalities. In this paper, we present a new multi-modal feature fusion approach named MAA (Modality-Agnostic Adapter), trying to make the model learn the importance of different modalities in different cases adaptively, without giving a prior setting in the model architecture. More specifically, we eliminate the modal differences in distribution and then use a modality-agnostic Transformer encoder for a semantic-level feature fusion. Our experiments demonstrate that MAA achieves state-of-the-art results on benchmarks by applying the same modalities with previous methods. Besides, it is worth mentioning that new modalities can be easily added when using MAA and further boost the performance. Code is available at https://github.com/quniLcs/MAA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02769v1-abstract-full').style.display = 'none'; document.getElementById('2407.02769v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00336">arXiv:2407.00336</a> <span> [<a href="https://arxiv.org/pdf/2407.00336">pdf</a>, <a href="https://arxiv.org/format/2407.00336">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dual-view Aware Smart Contract Vulnerability Detection for Ethereum </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+J">Jiacheng Yao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Maolin Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wanqi Chen</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chengxiang Jin</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiajun Zhou</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shanqing Yu</a>, <a href="/search/cs?searchtype=author&query=Xuan%2C+Q">Qi Xuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00336v1-abstract-short" style="display: inline;"> The wide application of Ethereum technology has brought technological innovation to traditional industries. As one of Ethereum's core applications, smart contracts utilize diverse contract codes to meet various functional needs and have gained widespread use. However, the non-tamperability of smart contracts, coupled with vulnerabilities caused by natural flaws or human errors, has brought unprece… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00336v1-abstract-full').style.display = 'inline'; document.getElementById('2407.00336v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00336v1-abstract-full" style="display: none;"> The wide application of Ethereum technology has brought technological innovation to traditional industries. As one of Ethereum's core applications, smart contracts utilize diverse contract codes to meet various functional needs and have gained widespread use. However, the non-tamperability of smart contracts, coupled with vulnerabilities caused by natural flaws or human errors, has brought unprecedented challenges to blockchain security. Therefore, in order to ensure the healthy development of blockchain technology and the stability of the blockchain community, it is particularly important to study the vulnerability detection techniques for smart contracts. In this paper, we propose a Dual-view Aware Smart Contract Vulnerability Detection Framework named DVDet. The framework initially converts the source code and bytecode of smart contracts into weighted graphs and control flow sequences, capturing potential risk features from these two perspectives and integrating them for analysis, ultimately achieving effective contract vulnerability detection. Comprehensive experiments on the Ethereum dataset show that our method outperforms others in detecting vulnerabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00336v1-abstract-full').style.display = 'none'; document.getElementById('2407.00336v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by International Conference on Blockchain and Trustworthy Systems 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14449">arXiv:2406.14449</a> <span> [<a href="https://arxiv.org/pdf/2406.14449">pdf</a>, <a href="https://arxiv.org/format/2406.14449">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> APEER: Automatic Prompt Engineering Enhances Large Language Model Reranking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+C">Can Jin</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Hongwu Peng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shiyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenting Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wujiang Xu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+L">Ligong Han</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jiahui Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+K">Kai Zhong</a>, <a href="/search/cs?searchtype=author&query=Rajasekaran%2C+S">Sanguthevar Rajasekaran</a>, <a href="/search/cs?searchtype=author&query=Metaxas%2C+D+N">Dimitris N. Metaxas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14449v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have significantly enhanced Information Retrieval (IR) across various modules, such as reranking. Despite impressive performance, current zero-shot relevance ranking with LLMs heavily relies on human prompt engineering. Existing automatic prompt engineering algorithms primarily focus on language modeling and classification tasks, leaving the domain of IR, particularly… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14449v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14449v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14449v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have significantly enhanced Information Retrieval (IR) across various modules, such as reranking. Despite impressive performance, current zero-shot relevance ranking with LLMs heavily relies on human prompt engineering. Existing automatic prompt engineering algorithms primarily focus on language modeling and classification tasks, leaving the domain of IR, particularly reranking, underexplored. Directly applying current prompt engineering algorithms to relevance ranking is challenging due to the integration of query and long passage pairs in the input, where the ranking complexity surpasses classification tasks. To reduce human effort and unlock the potential of prompt optimization in reranking, we introduce a novel automatic prompt engineering algorithm named APEER. APEER iteratively generates refined prompts through feedback and preference optimization. Extensive experiments with four LLMs and ten datasets demonstrate the substantial performance improvement of APEER over existing state-of-the-art (SoTA) manual prompts. Furthermore, we find that the prompts generated by APEER exhibit better transferability across diverse tasks and LLMs. Code is available at https://github.com/jincan333/APEER. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14449v1-abstract-full').style.display = 'none'; document.getElementById('2406.14449v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08187">arXiv:2406.08187</a> <span> [<a href="https://arxiv.org/pdf/2406.08187">pdf</a>, <a href="https://arxiv.org/format/2406.08187">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Learning-based Traversability Costmap for Autonomous Off-road Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qiumin Zhu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhen Sun</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Songpengcheng Xia</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Guoqing Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+K">Kehui Ma</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+L">Ling Pei</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Zheng Gong</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08187v2-abstract-short" style="display: inline;"> Traversability estimation in off-road terrains is an essential procedure for autonomous navigation. However, creating reliable labels for complex interactions between the robot and the surface is still a challenging problem in learning-based costmap generation. To address this, we propose a method that predicts traversability costmaps by leveraging both visual and geometric information of the envi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08187v2-abstract-full').style.display = 'inline'; document.getElementById('2406.08187v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08187v2-abstract-full" style="display: none;"> Traversability estimation in off-road terrains is an essential procedure for autonomous navigation. However, creating reliable labels for complex interactions between the robot and the surface is still a challenging problem in learning-based costmap generation. To address this, we propose a method that predicts traversability costmaps by leveraging both visual and geometric information of the environment. To quantify the surface properties like roughness and bumpiness, we introduce a novel way of risk-aware labelling with proprioceptive information for network training. We validate our method in costmap prediction and navigation tasks for complex off-road scenarios. Our results demonstrate that our costmap prediction method excels in terms of average accuracy and MSE. The navigation results indicate that using our learned costmaps leads to safer and smoother driving, outperforming previous methods in terms of the highest success rate, lowest normalized trajectory length, lowest time cost, and highest mean stability across two scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08187v2-abstract-full').style.display = 'none'; document.getElementById('2406.08187v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07131">arXiv:2406.07131</a> <span> [<a href="https://arxiv.org/pdf/2406.07131">pdf</a>, <a href="https://arxiv.org/format/2406.07131">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ICGAN: An implicit conditioning method for interpretable feature control of neural audio synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yunyi Liu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Craig Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07131v1-abstract-short" style="display: inline;"> Neural audio synthesis methods can achieve high-fidelity and realistic sound generation by utilizing deep generative models. Such models typically rely on external labels which are often discrete as conditioning information to achieve guided sound generation. However, it remains difficult to control the subtle changes in sounds without appropriate and descriptive labels, especially given a limited… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07131v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07131v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07131v1-abstract-full" style="display: none;"> Neural audio synthesis methods can achieve high-fidelity and realistic sound generation by utilizing deep generative models. Such models typically rely on external labels which are often discrete as conditioning information to achieve guided sound generation. However, it remains difficult to control the subtle changes in sounds without appropriate and descriptive labels, especially given a limited dataset. This paper proposes an implicit conditioning method for neural audio synthesis using generative adversarial networks that allows for interpretable control of the acoustic features of synthesized sounds. Our technique creates a continuous conditioning space that enables timbre manipulation without relying on explicit labels. We further introduce an evaluation metric to explore controllability and demonstrate that our approach is effective in enabling a degree of controlled variation of different synthesized sound effects for in-domain and cross-domain sounds. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07131v1-abstract-full').style.display = 'none'; document.getElementById('2406.07131v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06959">arXiv:2406.06959</a> <span> [<a href="https://arxiv.org/pdf/2406.06959">pdf</a>, <a href="https://arxiv.org/format/2406.06959">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Unleashing the Denoising Capability of Diffusion Prior for Solving Inverse Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+J">Jiaxin Zhuang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gen Li</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yuantao Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.06959v2-abstract-short" style="display: inline;"> The recent emergence of diffusion models has significantly advanced the precision of learnable priors, presenting innovative avenues for addressing inverse problems. Since inverse problems inherently entail maximum a posteriori estimation, previous works have endeavored to integrate diffusion priors into the optimization frameworks. However, prevailing optimization-based inverse algorithms primari… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06959v2-abstract-full').style.display = 'inline'; document.getElementById('2406.06959v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.06959v2-abstract-full" style="display: none;"> The recent emergence of diffusion models has significantly advanced the precision of learnable priors, presenting innovative avenues for addressing inverse problems. Since inverse problems inherently entail maximum a posteriori estimation, previous works have endeavored to integrate diffusion priors into the optimization frameworks. However, prevailing optimization-based inverse algorithms primarily exploit the prior information within the diffusion models while neglecting their denoising capability. To bridge this gap, this work leverages the diffusion process to reframe noisy inverse problems as a two-variable constrained optimization task by introducing an auxiliary optimization variable. By employing gradient truncation, the projection gradient descent method is efficiently utilized to solve the corresponding optimization problem. The proposed algorithm, termed ProjDiff, effectively harnesses the prior information and the denoising capability of a pre-trained diffusion model within the optimization framework. Extensive experiments on the image restoration tasks and source separation and partial generation tasks demonstrate that ProjDiff exhibits superior performance across various linear and nonlinear inverse problems, highlighting its potential for practical applications. Code is available at https://github.com/weigerzan/ProjDiff/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06959v2-abstract-full').style.display = 'none'; document.getElementById('2406.06959v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Jin%2C+C&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Jin%2C+C&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Jin%2C+C&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Jin%2C+C&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Jin%2C+C&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Jin%2C+C&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Jin%2C+C&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository