
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 579 results for author: <span class="mathjax">Xia, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Xia%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Xia, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Xia%2C+S&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Xia, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Xia%2C+S&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Xia%2C+S&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xia%2C+S&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xia%2C+S&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xia%2C+S&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xia%2C+S&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13595">arXiv:2502.13595</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.13595">pdf</a>, <a href="https://arxiv.org/format/2502.13595">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> MMTEB: Massive Multilingual Text Embedding Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Enevoldsen%2C+K">Kenneth Enevoldsen</a>, <a href="/search/cs?searchtype=author&amp;query=Chung%2C+I">Isaac Chung</a>, <a href="/search/cs?searchtype=author&amp;query=Kerboua%2C+I">Imene Kerboua</a>, <a href="/search/cs?searchtype=author&amp;query=Kardos%2C+M">M谩rton Kardos</a>, <a href="/search/cs?searchtype=author&amp;query=Mathur%2C+A">Ashwin Mathur</a>, <a href="/search/cs?searchtype=author&amp;query=Stap%2C+D">David Stap</a>, <a href="/search/cs?searchtype=author&amp;query=Gala%2C+J">Jay Gala</a>, <a href="/search/cs?searchtype=author&amp;query=Siblini%2C+W">Wissam Siblini</a>, <a href="/search/cs?searchtype=author&amp;query=Krzemi%C5%84ski%2C+D">Dominik Krzemi艅ski</a>, <a href="/search/cs?searchtype=author&amp;query=Winata%2C+G+I">Genta Indra Winata</a>, <a href="/search/cs?searchtype=author&amp;query=Sturua%2C+S">Saba Sturua</a>, <a 
href="/search/cs?searchtype=author&amp;query=Utpala%2C+S">Saiteja Utpala</a>, <a href="/search/cs?searchtype=author&amp;query=Ciancone%2C+M">Mathieu Ciancone</a>, <a href="/search/cs?searchtype=author&amp;query=Schaeffer%2C+M">Marion Schaeffer</a>, <a href="/search/cs?searchtype=author&amp;query=Sequeira%2C+G">Gabriel Sequeira</a>, <a href="/search/cs?searchtype=author&amp;query=Misra%2C+D">Diganta Misra</a>, <a href="/search/cs?searchtype=author&amp;query=Dhakal%2C+S">Shreeya Dhakal</a>, <a href="/search/cs?searchtype=author&amp;query=Rystr%C3%B8m%2C+J">Jonathan Rystr酶m</a>, <a href="/search/cs?searchtype=author&amp;query=Solomatin%2C+R">Roman Solomatin</a>, <a href="/search/cs?searchtype=author&amp;query=%C3%87a%C4%9Fatan%2C+%C3%96">脰mer 脟a臒atan</a>, <a href="/search/cs?searchtype=author&amp;query=Kundu%2C+A">Akash Kundu</a>, <a href="/search/cs?searchtype=author&amp;query=Bernstorff%2C+M">Martin Bernstorff</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+S">Shitao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Sukhlecha%2C+A">Akshita Sukhlecha</a>, <a href="/search/cs?searchtype=author&amp;query=Pahwa%2C+B">Bhavish Pahwa</a> , et al. (61 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13595v1-abstract-short" style="display: inline;"> Text embeddings are typically evaluated on a limited set of tasks, which are constrained by language, domain, and task diversity. To address these limitations and provide a more comprehensive evaluation, we introduce the Massive Multilingual Text Embedding Benchmark (MMTEB) - a large-scale, community-driven expansion of MTEB, covering over 500 quality-controlled evaluation tasks across 250+ langua&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13595v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13595v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13595v1-abstract-full" style="display: none;"> Text embeddings are typically evaluated on a limited set of tasks, which are constrained by language, domain, and task diversity. To address these limitations and provide a more comprehensive evaluation, we introduce the Massive Multilingual Text Embedding Benchmark (MMTEB) - a large-scale, community-driven expansion of MTEB, covering over 500 quality-controlled evaluation tasks across 250+ languages. MMTEB includes a diverse set of challenging, novel tasks such as instruction following, long-document retrieval, and code retrieval, representing the largest multilingual collection of evaluation tasks for embedding models to date. Using this collection, we develop several highly multilingual benchmarks, which we use to evaluate a representative set of models. We find that while large language models (LLMs) with billions of parameters can achieve state-of-the-art performance on certain language subsets and task categories, the best-performing publicly available model is multilingual-e5-large-instruct with only 560 million parameters. To facilitate accessibility and reduce computational cost, we introduce a novel downsampling method based on inter-task correlation, ensuring a diverse selection while preserving relative model rankings. 
Furthermore, we optimize tasks such as retrieval by sampling hard negatives, creating smaller but effective splits. These optimizations allow us to introduce benchmarks that drastically reduce computational demands. For instance, our newly introduced zero-shot English benchmark maintains a ranking order similar to the full-scale version but at a fraction of the computational cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13595v1-abstract-full').style.display = 'none'; document.getElementById('2502.13595v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for ICLR: https://openreview.net/forum?id=zl3pfz4VCV</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12917">arXiv:2502.12917</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12917">pdf</a>, <a href="https://arxiv.org/format/2502.12917">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Contrast-Unity for Partially-Supervised Temporal Sentence Grounding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haicheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ju%2C+C">Chen Ju</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+W">Weixiong Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+C">Chaofan Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+S">Shuai Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Ya Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanfeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12917v1-abstract-short" style="display: inline;"> Temporal sentence grounding aims to detect event timestamps described by the natural language query from given untrimmed videos. The existing fully-supervised setting achieves great results but requires expensive annotation costs; while the weakly-supervised setting adopts cheap labels but performs poorly. To pursue high performance with less annotation costs, this paper introduces an intermediate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12917v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12917v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12917v1-abstract-full" style="display: none;"> Temporal sentence grounding aims to detect event timestamps described by the natural language query from given untrimmed videos. The existing fully-supervised setting achieves great results but requires expensive annotation costs; while the weakly-supervised setting adopts cheap labels but performs poorly. 
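
The inter-task-correlation downsampling lends itself to a short illustration. The sketch below is a hypothetical stand-in, not the MMTEB implementation: it greedily selects tasks whose mean scores preserve the full-benchmark model ranking (Spearman correlation) while penalising tasks that correlate strongly with those already chosen. The `scores` matrix, the 0.5 redundancy weight, and the greedy search are all assumptions.

```python
import numpy as np
from scipy.stats import spearmanr

def downsample_tasks(scores: np.ndarray, k: int) -> list:
    """Pick k tasks whose mean scores preserve the full-benchmark ranking."""
    full_mean = scores.mean(axis=1)            # reference model ranking
    task_corr = np.corrcoef(scores.T)          # inter-task correlation
    chosen = []
    for _ in range(k):
        best_task, best_gain = None, -np.inf
        for t in range(scores.shape[1]):
            if t in chosen:
                continue
            subset_mean = scores[:, chosen + [t]].mean(axis=1)
            rank_fit = spearmanr(full_mean, subset_mean)[0]
            # penalise redundancy with already-selected tasks
            redundancy = max((task_corr[t, c] for c in chosen), default=0.0)
            gain = rank_fit - 0.5 * redundancy
            if gain > best_gain:
                best_task, best_gain = t, gain
        chosen.append(best_task)
    return chosen

rng = np.random.default_rng(0)
scores = rng.random((10, 40))                  # 10 models x 40 tasks
print(downsample_tasks(scores, k=8))
```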

2. arXiv:2502.12917 [pdf, other] (cs.CV)
   Contrast-Unity for Partially-Supervised Temporal Sentence Grounding
   Authors: Haicheng Wang, Chen Ju, Weixiong Lin, Chaofan Ma, Shuai Xiao, Ya Zhang, Yanfeng Wang
   Abstract: Temporal sentence grounding aims to detect event timestamps described by the natural language query from given untrimmed videos. The existing fully-supervised setting achieves great results but requires expensive annotations, while the weakly-supervised setting adopts cheap labels but performs poorly. To pursue high performance at lower annotation cost, this paper introduces an intermediate partially-supervised setting, i.e., only short-clip annotations are available during training. To make full use of partial labels, we design a contrast-unity framework with the two-stage goal of implicit-explicit progressive grounding. In the implicit stage, we align event-query representations at fine granularity using comprehensive quadruple contrastive learning: event-query gather, event-background separation, intra-cluster compactness, and inter-cluster separability. The resulting high-quality representations then yield acceptable grounding pseudo-labels. In the explicit stage, to explicitly optimize grounding objectives, we train a fully-supervised model using the obtained pseudo-labels for grounding refinement and denoising. Extensive experiments and thorough ablations on Charades-STA and ActivityNet Captions demonstrate the significance of partial supervision, as well as our superior performance.
   Submitted 18 February, 2025; originally announced February 2025.
   Comments: Accepted by ICASSP 2025. The first two authors contributed equally. arXiv admin note: text overlap with arXiv:2302.09850
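
As a gloss on the contrastive vocabulary above, here is a minimal sketch of the first two of the four terms (event-query gather and event-background separation) written as a single InfoNCE-style loss. This is an illustration under stated assumptions rather than the authors' code; the feature shapes, the temperature, and the use of in-video background clips as negatives are hypothetical.

```python
import torch
import torch.nn.functional as F

def gather_separate_loss(query, event, background, tau=0.07):
    """query, event: (B, D); background: (B, N, D) clip features."""
    q = F.normalize(query, dim=-1)
    e = F.normalize(event, dim=-1)
    b = F.normalize(background, dim=-1)
    pos = (q * e).sum(-1, keepdim=True) / tau        # event-query gather
    neg = torch.einsum('bd,bnd->bn', q, b) / tau     # event-background separation
    logits = torch.cat([pos, neg], dim=1)
    # InfoNCE: the ground-truth event clip sits at index 0
    target = torch.zeros(logits.size(0), dtype=torch.long)
    return F.cross_entropy(logits, target)

loss = gather_separate_loss(torch.randn(4, 256), torch.randn(4, 256),
                            torch.randn(4, 8, 256))
print(float(loss))
```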

3. arXiv:2502.12535 [pdf, other] (cs.CV)
   Learning Transformation-Isomorphic Latent Space for Accurate Hand Pose Estimation
   Authors: Kaiwen Ren, Lei Hu, Zhiheng Zhang, Yongjing Ye, Shihong Xia
   Abstract: Vision-based regression tasks, such as hand pose estimation, have achieved higher accuracy and faster convergence through representation learning. However, existing representation learning methods often encounter the following issues: the high semantic level of features extracted from images is inadequate for regressing low-level information, and the extracted features include task-irrelevant information, reducing their compactness and interfering with regression tasks. To address these challenges, we propose TI-Net, a highly versatile visual network backbone designed to construct a transformation-isomorphic latent space. Specifically, we employ linear transformations to model geometric transformations in the latent space and ensure that TI-Net aligns them with those in the image space. This ensures that the latent features capture compact, low-level information beneficial for pose estimation tasks. We evaluated TI-Net on the hand pose estimation task to demonstrate the network's superiority. On the DexYCB dataset, TI-Net achieved a 10% improvement in the PA-MPJPE metric compared to specialized state-of-the-art (SOTA) hand pose estimation methods. Our code will be released in the future.
   Submitted 17 February, 2025; originally announced February 2025.
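
The transformation-isomorphism constraint can be stated compactly: a learned linear map L_g in latent space should mimic the image-space transform g, i.e. f(g(x)) ≈ L_g(f(x)). The toy sketch below shows such an alignment loss; the flattening encoder and the horizontal flip standing in for g are placeholders, not the TI-Net architecture.

```python
import torch
import torch.nn as nn

# Toy encoder f and a learned linear map L_g for one fixed transform g.
encoder = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 128))
latent_map = nn.Linear(128, 128, bias=False)

def iso_loss(x):
    gx = torch.flip(x, dims=[-1])      # image-space transform g: horizontal flip
    z, z_g = encoder(x), encoder(gx)
    return ((latent_map(z) - z_g) ** 2).mean()   # align L_g(f(x)) with f(g(x))

x = torch.randn(8, 3, 32, 32)
print(float(iso_loss(x)))
```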

4. arXiv:2502.12534 [pdf, other] (cs.CV)
   NoKSR: Kernel-Free Neural Surface Reconstruction via Point Cloud Serialization
   Authors: Zhen Li, Weiwei Sun, Shrisudhan Govindarajan, Shaobo Xia, Daniel Rebain, Kwang Moo Yi, Andrea Tagliasacchi
   Abstract: We present a novel approach to large-scale point cloud surface reconstruction by developing an efficient framework that converts an irregular point cloud into a signed distance field (SDF). Our backbone builds upon recent transformer-based architectures (i.e., PointTransformerV3), which serialize the point cloud into a locality-preserving sequence of tokens. We efficiently predict the SDF value at a point by aggregating nearby tokens, where fast approximate neighbors can be retrieved thanks to the serialization. We serialize the point cloud at different levels/scales and non-linearly aggregate a feature to predict the SDF value. We show that aggregating across multiple scales is critical to overcome the approximations introduced by the serialization (i.e., false negatives in the neighborhood). Our framework sets the new state of the art in terms of accuracy and efficiency (better or similar performance with half the latency of the best prior method, coupled with a simpler implementation), particularly on outdoor datasets where sparse-grid methods have shown limited performance.
   Submitted 18 February, 2025; v1 submitted 17 February, 2025; originally announced February 2025.
   Comments: Project page: https://theialab.github.io/noksr/

5. arXiv:2502.10669 [pdf, other] (cs.CV)
   Is Self-Supervised Pre-training on Satellite Imagery Better than ImageNet? A Systematic Study with Sentinel-2
   Authors: Saad Lahrichi, Zion Sheng, Shufan Xia, Kyle Bradbury, Jordan Malof
   Abstract: Self-supervised learning (SSL) has demonstrated significant potential in pre-training robust models with limited labeled data, making it particularly valuable for remote sensing (RS) tasks. A common assumption is that pre-training on domain-aligned data provides maximal benefits on downstream tasks, particularly when compared to ImageNet-pretraining (INP). In this work, we investigate this assumption by collecting GeoNet, a large and diverse dataset of global optical Sentinel-2 imagery, and pre-training SwAV and MAE on both GeoNet and ImageNet. Evaluating these models on six downstream tasks in the few-shot setting reveals that SSL pre-training on RS data offers modest performance improvements over INP, and that it remains competitive in multiple scenarios. This indicates that the presumed benefits of SSL pre-training on RS data may be overstated, and the additional costs of data curation and pre-training could be unjustified.
   Submitted 14 February, 2025; originally announced February 2025.

6. arXiv:2502.07644 [pdf, other] (cs.AI)
   SymGPT: Auditing Smart Contracts via Combining Symbolic Execution with Large Language Models
   Authors: Shihao Xia, Mengting He, Shuai Shao, Tingting Yu, Yiying Zhang, Linhai Song
   Abstract: To govern smart contracts running on Ethereum, multiple Ethereum Request for Comment (ERC) standards have been developed, each having a set of rules to guide the behaviors of smart contracts. Violating the ERC rules could cause serious security issues and financial loss, signifying the importance of verifying that smart contracts follow ERCs. Today's practice of such verification is to manually audit each contract, use expert-developed program-analysis tools, or use large language models (LLMs), all of which are far from effective in identifying ERC rule violations. This paper introduces SymGPT, a tool that combines the natural language understanding of large language models (LLMs) with the formal guarantees of symbolic execution to automatically verify smart contracts' compliance with ERC rules. To develop SymGPT, we conduct an empirical study of 132 ERC rules from three widely used ERC standards, examining their content, security implications, and natural language descriptions. Based on this study, we design SymGPT by first instructing an LLM to translate ERC rules into a defined EBNF grammar. We then synthesize constraints from the formalized rules to represent scenarios where violations may occur and use symbolic execution to detect them. Our evaluation shows that SymGPT identifies 5,783 ERC rule violations in 4,000 real-world contracts, including 1,375 violations with clear attack paths for stealing financial assets, demonstrating its effectiveness. Furthermore, SymGPT outperforms six automated techniques and a security-expert auditing service, underscoring its superiority over current smart contract analysis methods.
   Submitted 12 February, 2025; v1 submitted 11 February, 2025; originally announced February 2025.
   Comments: 16 pages. arXiv admin note: text overlap with arXiv:2404.04306
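
The constraint-synthesis step can be miniaturised with an off-the-shelf solver. The toy below is my construction, far simpler than SymGPT's EVM-level symbolic execution: it encodes an ERC-20-style rule -- a transfer must not succeed when the sender's balance is insufficient -- and asks Z3 whether a violating execution of a buggy transfer exists.

```python
from z3 import Ints, Solver, And, sat

balance, amount, new_balance = Ints('balance amount new_balance')

s = Solver()
# Symbolic semantics of a buggy transfer: it debits without a balance check.
s.add(new_balance == balance - amount)
# Violation scenario synthesized from the rule: the call went through
# even though the sender's balance was insufficient.
s.add(And(balance >= 0, amount > 0, balance < amount))

if s.check() == sat:
    print('rule violation reachable:', s.model())
```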

7. arXiv:2502.06707 [pdf, other] (cs.CE)
   FinMamba: Market-Aware Graph Enhanced Multi-Level Mamba for Stock Movement Prediction
   Authors: Yifan Hu, Peiyuan Liu, Yuante Li, Dawei Cheng, Naiqi Li, Tao Dai, Jigang Bao, Shu-Tao Xia
   Abstract: Recently, combining stock features with inter-stock correlations has become a common and effective approach for stock movement prediction. However, financial data presents significant challenges due to its low signal-to-noise ratio and the dynamic complexity of the market, which give rise to two key limitations in existing methods. First, the relationships between stocks are highly influenced by multifaceted factors including macroeconomic market dynamics, and current models fail to adaptively capture these evolving interactions under specific market conditions. Second, for the accuracy and timeliness required by real-world trading, existing financial data mining methods struggle to extract beneficial pattern-oriented dependencies from long historical data while maintaining high efficiency and low memory consumption. To address these limitations, we propose FinMamba, a Mamba-GNN-based framework for market-aware and multi-level hybrid stock movement prediction. Specifically, we devise a dynamic graph to learn the changing representations of inter-stock relationships by integrating a pruning module that adapts to market trends. Afterward, with a selective mechanism, the multi-level Mamba discards irrelevant information and resets states to skillfully recall historical patterns across multiple time scales with linear time costs, which are then jointly optimized for reliable prediction. Extensive experiments on U.S. and Chinese stock markets demonstrate the effectiveness of our proposed FinMamba, achieving state-of-the-art prediction accuracy and trading profitability, while maintaining low computational complexity. The code is available at https://github.com/TROUBADOUR000/FinMamba.
   Submitted 10 February, 2025; originally announced February 2025.

8. arXiv:2502.06663 [pdf, other] (cs.LG)
   EfficientLLM: Scalable Pruning-Aware Pretraining for Architecture-Agnostic Edge Language Models
   Authors: Xingrun Xing, Zheng Liu, Shitao Xiao, Boyan Gao, Yiming Liang, Wanpeng Zhang, Haokun Lin, Guoqi Li, Jiajun Zhang
   Abstract: Modern large language models (LLMs), driven by scaling laws, achieve emergent intelligence at large model sizes. Recently, growing concerns about cloud costs, latency, and privacy have made it urgent to develop compact edge language models. Unlike direct pretraining, which is bound by the scaling law, this work proposes pruning-aware pretraining, focusing on retaining the performance of much larger optimized models. It features the following characteristics: 1) Data-scalable: we introduce minimal parameter groups in the LLM and continuously optimize structural pruning, extending post-training pruning methods like LLM-Pruner and SparseGPT into the pretraining phase. 2) Architecture-agnostic: the LLM architecture is auto-designed using saliency-driven pruning, which for the first time exceeds SoTA human-designed LLMs in modern pretraining. We reveal that it achieves top-quality edge language models, termed EfficientLLM, by scaling up LLM compression and extending its boundary. EfficientLLM significantly outperforms SoTA baselines with 100M-1B parameters, such as MobileLLM, SmolLM, Qwen2.5-0.5B, OLMo-1B, and Llama3.2-1B, on common-sense benchmarks. As the first attempt, EfficientLLM bridges the performance gap between traditional LLM compression and direct pretraining methods; we will fully open-source the work at https://github.com/Xingrun-Xing2/EfficientLLM.
   Submitted 11 February, 2025; v1 submitted 10 February, 2025; originally announced February 2025.
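
The saliency-driven pruning criterion the abstract alludes to is easy to sketch. The snippet below shows the generic LLM-Pruner-style first-order Taylor score |w * dL/dw| applied to one linear layer, masking the least salient output channels; the layer, the 30% ratio, and the loss are placeholders, and EfficientLLM's continuous optimization during pretraining is not reproduced here.

```python
import torch
import torch.nn as nn

layer = nn.Linear(512, 512)                 # stand-in for one LLM projection
x = torch.randn(32, 512)
layer(x).pow(2).mean().backward()           # stand-in training loss

# First-order Taylor saliency per output channel: |w * dL/dw| summed row-wise.
saliency = (layer.weight * layer.weight.grad).abs().sum(dim=1)
k = int(0.3 * saliency.numel())             # mask the 30% least salient channels
pruned = torch.topk(saliency, k, largest=False).indices
with torch.no_grad():
    layer.weight[pruned] = 0.0
    layer.bias[pruned] = 0.0
print(f'masked {k} of {saliency.numel()} channels')
```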

9. arXiv:2502.06134 [pdf, other] (cs.CV, cs.AI)
   Integrating Sequence and Image Modeling in Irregular Medical Time Series Through Self-Supervised Learning
   Authors: Liuqing Chen, Shuhong Xiao, Shixian Ding, Shanhai Hu, Lingyun Sun
   Abstract: Medical time series are often irregular and face significant missingness, posing challenges for data analysis and clinical decision-making. Existing methods typically adopt a single modeling perspective, either treating series data as sequences or transforming them into image representations for further classification. In this paper, we propose a joint learning framework that incorporates both sequence and image representations. We also design three self-supervised learning strategies to facilitate the fusion of sequence and image representations, capturing a more generalizable joint representation. The results indicate that our approach outperforms seven other state-of-the-art models on three representative real-world clinical datasets. We further validate our approach by simulating two major types of real-world missingness through leave-sensors-out and leave-samples-out techniques. The results demonstrate that our approach is more robust and significantly surpasses other baselines in terms of classification performance.
   Submitted 9 February, 2025; originally announced February 2025.
   Comments: 9 pages, 2 figures, AAAI2025

10. arXiv:2502.03682 [pdf, other] (cs.CR, cs.HC)
   Towards Scalable Defenses against Intimate Partner Infiltrations
   Authors: Weisi Yang, Shinan Liu, Feng Xiao, Nick Feamster, Stephen Xia
   Abstract: Intimate Partner Infiltration (IPI)--a type of Intimate Partner Violence (IPV) that typically requires physical access to a victim's device--is a pervasive concern in the United States, often manifesting through digital surveillance, control, and monitoring. Unlike conventional cyberattacks, IPI perpetrators leverage close proximity and personal knowledge to circumvent standard protections, underscoring the need for targeted interventions. While security clinics and other human-centered approaches effectively tailor solutions for survivors, their scalability remains constrained by resource limitations and the need for specialized counseling. In this paper, we present AID, an Automated IPI Detection system that continuously monitors for unauthorized access and suspicious behaviors on smartphones. AID employs a two-stage architecture to process multimodal signals stealthily and preserve user privacy. A brief calibration phase upon installation enables AID to adapt to each user's behavioral patterns, achieving high accuracy with minimal false alarms. Our 27-participant user study demonstrates that AID achieves highly accurate detection of non-owner access and fine-grained IPI-related activities, attaining an end-to-end top-3 F1 score of 0.981 with a false positive rate of 4%. These findings suggest that AID can serve as a forensic tool within security clinics, scaling their ability to identify IPI tactics and deliver personalized, far-reaching support to survivors.
   Submitted 5 February, 2025; originally announced February 2025.

11. arXiv:2502.03387 [pdf, other] (cs.CL, cs.AI)
   LIMO: Less is More for Reasoning
   Authors: Yixin Ye, Zhen Huang, Yang Xiao, Ethan Chern, Shijie Xia, Pengfei Liu
   Abstract: We present a fundamental discovery that challenges our understanding of how complex reasoning emerges in large language models. While conventional wisdom suggests that sophisticated reasoning tasks demand extensive training data (>100,000 examples), we demonstrate that complex mathematical reasoning abilities can be effectively elicited with surprisingly few examples. Through comprehensive experiments, our proposed model LIMO demonstrates unprecedented performance in mathematical reasoning. With merely 817 curated training samples, LIMO achieves 57.1% accuracy on AIME and 94.8% on MATH, improving from previous SFT-based models' 6.5% and 59.2% respectively, while only using 1% of the training data required by previous approaches. LIMO demonstrates exceptional out-of-distribution generalization, achieving 40.5% absolute improvement across 10 diverse benchmarks, outperforming models trained on 100x more data, challenging the notion that SFT leads to memorization rather than generalization. Based on these results, we propose the Less-Is-More Reasoning Hypothesis (LIMO Hypothesis): In foundation models where domain knowledge has been comprehensively encoded during pre-training, sophisticated reasoning capabilities can emerge through minimal but precisely orchestrated demonstrations of cognitive processes. This hypothesis posits that the elicitation threshold for complex reasoning is determined by two key factors: (1) the completeness of the model's encoded knowledge foundation during pre-training, and (2) the effectiveness of post-training examples as "cognitive templates" that show the model how to utilize its knowledge base to solve complex reasoning tasks. To facilitate reproducibility and future research in data-efficient reasoning, we release LIMO as a comprehensive open-source suite at https://github.com/GAIR-NLP/LIMO.
   Submitted 5 February, 2025; originally announced February 2025.
   Comments: 17 pages
arXiv:2502.00818 (https://arxiv.org/abs/2502.00818) [stat.ML, cs.LG]
Title: Error-quantified Conformal Inference for Time Series
Authors: Junxi Wu, Dongjian Hu, Yajie Bao, Shu-Tao Xia, Changliang Zou
Abstract: Uncertainty quantification in time series prediction is challenging due to the temporal dependence and distribution shift of sequential data. Conformal inference provides a pivotal and flexible instrument for assessing the uncertainty of machine learning models through prediction sets. Recently, a series of online conformal inference methods have updated the thresholds of prediction sets by performing online gradient descent on a sequence of quantile loss functions. A drawback of such methods is that they use only the information of revealed non-conformity scores via miscoverage indicators, ignoring error quantification, namely the distance between the non-conformity score and the current threshold. To accurately leverage the dynamics of the miscoverage error, we propose Error-quantified Conformal Inference (ECI), which smooths the quantile loss function. ECI introduces a continuous and adaptive feedback scale based on the miscoverage error, rather than the simple binary feedback of existing methods. We establish a long-term coverage guarantee for ECI under arbitrary dependence and distribution shift. Extensive experimental results show that ECI achieves valid miscoverage control and outputs tighter prediction sets than other baselines.
Submitted: 2 February, 2025; originally announced February 2025.
Comments: ICLR 2025 camera-ready version
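The core mechanic is easy to sketch: classical online conformal updates move the threshold by a fixed step whose sign depends on a binary miscoverage indicator, whereas error quantification replaces that indicator with a smooth function of how far the score landed from the threshold. A minimal numpy illustration in the spirit of ECI; the sigmoid feedback, step size, and temperature are illustrative choices, not the paper's exact formulation:

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def online_conformal(scores, alpha=0.1, eta=0.05, tau=0.1, smooth=True):
    """Track a (1 - alpha) threshold over a stream of non-conformity scores.

    smooth=False: binary miscoverage feedback (gradient of the quantile loss).
    smooth=True : sigmoid feedback scaled by the distance between the score
                  and the threshold, i.e. the miscoverage error is quantified.
    """
    q, covered = 0.0, []
    for s in scores:
        covered.append(s <= q)                   # coverage before the update
        feedback = sigmoid((s - q) / tau) if smooth else float(s > q)
        q += eta * (feedback - alpha)            # ascend toward the (1-alpha) quantile
    return np.mean(covered)

scores = np.abs(np.random.default_rng(1).normal(size=5000))   # |residuals| as scores
print("empirical coverage:", online_conformal(scores))        # ~0.9 after warm-up
```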
arXiv:2501.18413 (https://arxiv.org/abs/2501.18413) [cs.AI, cs.LG]
Title: GBFRS: Robust Fuzzy Rough Sets via Granular-ball Computing
Authors: Shuyin Xia, Xiaoyu Lian, Binbin Sang, Guoyin Wang, Xinbo Gao
Abstract: Fuzzy rough set theory is effective for processing datasets with complex attributes, supported by a solid mathematical foundation and closely linked to kernel methods in machine learning. Attribute reduction algorithms and classifiers based on fuzzy rough set theory exhibit promising performance in the analysis of high-dimensional multivariate complex data. However, most existing models operate at the finest granularity, rendering them inefficient and sensitive to noise, especially for high-dimensional big data. Thus, enhancing the robustness of fuzzy rough set models is crucial for effective feature selection. Multi-granularity granular-ball computing, a recent development, uses granular-balls of different sizes to adaptively represent and cover the sample space, performing learning based on these granular-balls. This paper proposes integrating multi-granularity granular-ball computing into fuzzy rough set theory, using granular-balls to replace sample points. The coarse-grained characteristics of granular-balls make the model more robust. Additionally, we propose a new method for generating granular-balls that scales to supervised methods based on granular-ball computing. A forward search algorithm is used to select feature sequences by defining the correlation between features and categories through dependence functions. Experiments demonstrate the proposed model's effectiveness and superiority over baseline methods.
Submitted: 30 January, 2025; originally announced January 2025.
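The granular-ball idea above replaces individual samples with coarse balls that cover the sample space. A common generation recipe, sketched below with 2-means splitting and a purity stopping rule, conveys the flavor; the paper's actual generation method and the fuzzy-rough machinery built on top are more involved.

```python
import numpy as np
from sklearn.cluster import KMeans

def granular_balls(X, y, purity=0.9, min_size=4):
    """Split (X, y) into balls until each ball's majority label is pure enough.
    Returns (center, radius, majority_label) triples; y must be int labels."""
    balls, stack = [], [(X, y)]
    while stack:
        Xb, yb = stack.pop()
        counts = np.bincount(yb)
        pure_enough = counts.max() / len(yb) >= purity or len(yb) <= min_size
        if not pure_enough:
            split = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(Xb)
            if 0 < split.sum() < len(split):     # non-degenerate 2-means split
                stack.append((Xb[split == 0], yb[split == 0]))
                stack.append((Xb[split == 1], yb[split == 1]))
                continue
        center = Xb.mean(axis=0)
        radius = np.linalg.norm(Xb - center, axis=1).mean()
        balls.append((center, radius, int(counts.argmax())))
    return balls

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 2))
y = (X[:, 0] + 0.2 * rng.normal(size=300) > 0).astype(int)
print(len(granular_balls(X, y)), "balls")
```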
arXiv:2501.16302 (https://arxiv.org/abs/2501.16302) [cs.CL]
Title: Matryoshka Re-Ranker: A Flexible Re-Ranking Architecture With Configurable Depth and Width
Authors: Zheng Liu, Chaofan Li, Shitao Xiao, Chaozhuo Li, Defu Lian, Yingxia Shao
Abstract: Large language models (LLMs) provide powerful foundations for fine-grained text re-ranking.
However, they are often prohibitively costly in practice due to constraints on computation bandwidth. In this work, we propose a flexible architecture called Matryoshka Re-Ranker, designed to facilitate runtime customization of model layers and of sequence lengths at each layer based on users' configurations, so that LLM-based re-rankers can be made applicable across various real-world situations. The increased flexibility may come at the cost of precision. To address this problem, we introduce a suite of techniques to optimize performance. First, we propose cascaded self-distillation, where each sub-architecture learns to preserve the precise re-ranking performance of its super components, whose predictions can be exploited as smooth and informative teacher signals. Second, we design a factorized compensation mechanism, where two collaborative Low-Rank Adaptation modules, vertical and horizontal, are jointly employed to compensate for the precision loss resulting from arbitrary combinations of layer and sequence compression. We perform comprehensive experiments on the passage and document retrieval datasets from MS MARCO, along with all public datasets from the BEIR benchmark. In our experiments, Matryoshka Re-Ranker substantially outperforms existing methods, while effectively preserving its superior performance across various forms of compression and different application scenarios.
Submitted: 27 January, 2025; originally announced January 2025.
Comments: The Web Conference 2025
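Cascaded self-distillation is the part that translates most directly into code: every shallower sub-network regresses the detached scores of the next deeper one, so any truncated depth remains a usable re-ranker. A toy PyTorch sketch; the linear "layers", MSE objective, and depth schedule are stand-ins for the paper's LLM layers, teacher signals, and LoRA compensation:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyReRanker(nn.Module):
    """Toy re-ranker whose depth can be truncated at inference time."""
    def __init__(self, dim=32, n_layers=4):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(dim, dim) for _ in range(n_layers))
        self.head = nn.Linear(dim, 1)

    def forward(self, x, depth=None):
        for layer in self.layers[:depth]:        # depth=None runs all layers
            x = torch.relu(layer(x))
        return self.head(x).squeeze(-1)          # one relevance score per candidate

model = TinyReRanker()
x = torch.randn(8, 32)                           # 8 candidate passages (toy features)

# Each sub-architecture learns from its immediate "super" component.
loss = torch.zeros(())
for depth in (3, 2, 1):
    teacher = model(x, depth=depth + 1).detach() # deeper network acts as teacher
    loss = loss + F.mse_loss(model(x, depth=depth), teacher)
loss.backward()
```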
arXiv:2501.13340 (https://arxiv.org/abs/2501.13340) [cs.CV]
Title: Retrievals Can Be Detrimental: A Contrastive Backdoor Attack Paradigm on Retrieval-Augmented Diffusion Models
Authors: Hao Fang, Xiaohang Sui, Hongyao Yu, Jiawei Kong, Sijin Yu, Bin Chen, Hao Wu, Shu-Tao Xia
Abstract: Diffusion models (DMs) have recently demonstrated remarkable generation capability. However, their training generally requires huge computational resources and large-scale datasets. To address this, recent studies empower DMs with the advanced Retrieval-Augmented Generation (RAG) technique and propose retrieval-augmented diffusion models (RDMs). By incorporating rich knowledge from an auxiliary database, RAG enhances diffusion models' generation and generalization ability while significantly reducing model parameters. Despite this success, RAG may introduce novel security issues that warrant further investigation. In this paper, we reveal that RDMs are susceptible to backdoor attacks by proposing a multimodal contrastive attack approach named BadRDM. Our framework fully considers RAG's characteristics and is devised to manipulate the retrieved items for given text triggers, thereby controlling the generated contents. Specifically, we first insert a tiny portion of images into the retrieval database as target toxicity surrogates.
Subsequently, a malicious variant of contrastive learning is adopted to inject backdoors into the retriever, building shortcuts from triggers to the toxicity surrogates. Furthermore, we enhance the attacks through novel entropy-based selection and generative augmentation strategies that derive better toxicity surrogates. Extensive experiments on two mainstream tasks demonstrate that the proposed BadRDM achieves outstanding attack effects while preserving the model's benign utility.
Submitted: 22 January, 2025; originally announced January 2025.

arXiv:2501.13041 (https://arxiv.org/abs/2501.13041) [cs.LG]
Title: TimeFilter: Patch-Specific Spatial-Temporal Graph Filtration for Time Series Forecasting
Authors: Yifan Hu, Guibin Zhang, Peiyuan Liu, Disen Lan, Naiqi Li, Dawei Cheng, Tao Dai, Shu-Tao Xia, Shirui Pan
Abstract: Current time series forecasting methods can be broadly classified into two categories: Channel Independent (CI) and Channel Dependent (CD) strategies, both aiming to capture the complex dependencies within time series data.
However, the CI strategy fails to exploit highly correlated covariate information, while the CD strategy integrates all dependencies, including irrelevant or noisy ones, thus compromising generalization. To mitigate these issues, recent works have introduced the Channel Clustering (CC) strategy, which groups channels with similar characteristics and applies different modeling techniques to each cluster. However, coarse-grained clustering cannot flexibly capture complex, time-varying interactions. To address these challenges, we propose TimeFilter, a graph-based framework for adaptive and fine-grained dependency modeling. Specifically, after constructing a graph from the input sequence, TimeFilter filters out irrelevant correlations and preserves the most critical ones through patch-specific filtering. Extensive experiments on 13 real-world datasets from various application domains demonstrate the state-of-the-art performance of TimeFilter. The code is available at https://github.com/TROUBADOUR000/TimeFilter.
Submitted: 22 January, 2025; originally announced January 2025.
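The filtration step can be caricatured in a few lines: build a dense patch-to-patch dependency graph across channels, then keep only the strongest edges per patch. The numpy sketch below thresholds raw correlations; TimeFilter learns the filtering rather than hand-thresholding, so this is illustrative only:

```python
import numpy as np

def patch_specific_filter(series, patch_len=16, keep=3):
    """Split each channel into patches, build a patch-to-patch correlation
    graph, and keep only the `keep` strongest dependencies per patch."""
    C, T = series.shape
    n = T // patch_len
    patches = series[:, : n * patch_len].reshape(C * n, patch_len)
    corr = np.corrcoef(patches)                    # dense dependency graph
    np.fill_diagonal(corr, 0.0)                    # drop self-edges
    top = np.argsort(-np.abs(corr), axis=1)[:, :keep]
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.arange(corr.shape[0])[:, None], top] = True
    return np.where(mask, corr, 0.0)               # sparse, patch-specific graph

adj = patch_specific_filter(np.random.default_rng(0).normal(size=(4, 96)))
print(adj.shape, int((adj != 0).sum()) // adj.shape[0], "edges kept per patch")
```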
arXiv:2501.06233 (https://arxiv.org/abs/2501.06233) [cs.LG, cond-mat.mtrl-sci]
Title: Mechanics and Design of Metastructured Auxetic Patches with Bio-inspired Materials
Authors: Yingbin Chen, Milad Arzani, Xuan Mu, Sophia Jin, Shaoping Xiao
Abstract: Metastructured auxetic patches, characterized by negative Poisson's ratios, offer unique mechanical properties that closely resemble the behavior of human tissues and organs. As a result, these patches have gained significant attention for their potential applications in organ repair and tissue regeneration. This study focuses on neural network-based computational modeling of auxetic patches with a sinusoidal metastructure fabricated from silk fibroin, a bio-inspired material known for its biocompatibility and strength. The primary objective of this research is to introduce a novel, data-driven framework for patch design. To achieve this, we conducted experimental fabrication and mechanical testing to determine material properties and validate the corresponding finite element models. Finite element simulations were then employed to generate the necessary data, while greedy sampling, an active learning technique, was utilized to reduce the computational cost associated with data labeling. Two neural networks were trained to accurately predict Poisson's ratios and stresses, respectively, for strains up to 15%. Both models achieved R^2 scores exceeding 0.995, indicating highly reliable predictions. Building on this, we developed a neural network-based design model capable of tailoring patch designs to achieve specific mechanical properties. This model demonstrated superior performance compared to traditional optimization methods, such as genetic algorithms, by providing more efficient and precise design solutions. The proposed framework represents a significant advancement in the design of bio-inspired metastructures for medical applications, paving the way for future innovations in tissue engineering and regenerative medicine.
Submitted: 7 January, 2025; originally announced January 2025.
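The forward surrogate described here (design parameters in, Poisson's ratio out) is a small regression problem once the finite element data exist. A scikit-learn sketch on a synthetic stand-in for that data; the parameter names and the toy response function are invented for illustration, not taken from the paper:

```python
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
# Hypothetical design parameters of the sinusoidal metastructure:
# [amplitude, wavelength, strut width] -> effective Poisson's ratio.
X = rng.uniform(0.1, 1.0, size=(2000, 3))
y = -0.5 * X[:, 0] / X[:, 1] + 0.1 * X[:, 2]   # toy surrogate, not FE data

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
net = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=2000, random_state=0)
net.fit(X_tr, y_tr)
print("R^2 on held-out designs:", r2_score(y_te, net.predict(X_te)))
```

Inverting such a surrogate (property in, design out) is what the paper's design model does; a second network trained on the swapped input/output pairs is one simple way to approximate that step.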
arXiv:2501.04940 (https://arxiv.org/abs/2501.04940) [cs.LG, cs.CV]
Title: A New Perspective on Privacy Protection in Federated Learning with Granular-Ball Computing
Authors: Guannan Lai, Yihui Feng, Xin Yang, Xiaoyu Deng, Hao Yu, Shuyin Xia, Guoyin Wang, Tianrui Li
Abstract: Federated Learning (FL) facilitates collaborative model training while prioritizing privacy by avoiding direct data sharing. However, most existing work attempts to address challenges within the model's internal parameters and corresponding outputs, neglecting to solve them at the input level. To address this gap, we propose a novel framework called Granular-Ball Federated Learning (GrBFL) for image classification. GrBFL diverges from traditional methods that rely on the finest-grained input data. Instead, it segments images into multiple regions of optimal coarse granularity, which are then reconstructed into a graph structure. We designed a two-dimensional binary search segmentation algorithm based on variance constraints for GrBFL, which effectively removes redundant information while preserving key representative features. Extensive theoretical analysis and experiments demonstrate that GrBFL not only safeguards privacy and enhances efficiency but also maintains robust utility, consistently outperforming other state-of-the-art FL methods. The code is available at https://github.com/AIGNLAI/GrBFL.
Submitted: 8 January, 2025; originally announced January 2025.
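The variance-constrained binary-search segmentation admits a compact sketch: recursively bisect the image along its longer axis until each region's variance falls below a threshold, then keep only per-region summaries (the coarse granularity that GrBFL feeds into its graph construction). The threshold, minimum side length, and mean-only summary below are illustrative assumptions:

```python
import numpy as np

def segment(img, var_max=0.01, min_side=4):
    """Recursively split an image into regions until each region's variance
    falls below var_max; each region becomes one coarse-grained summary."""
    regions = []

    def split(r0, r1, c0, c1):
        patch = img[r0:r1, c0:c1]
        if patch.var() <= var_max or min(r1 - r0, c1 - c0) <= min_side:
            regions.append((r0, r1, c0, c1, float(patch.mean())))
            return
        if r1 - r0 >= c1 - c0:                   # bisect the longer axis
            mid = (r0 + r1) // 2
            split(r0, mid, c0, c1); split(mid, r1, c0, c1)
        else:
            mid = (c0 + c1) // 2
            split(r0, r1, c0, mid); split(r0, r1, mid, c1)

    split(0, img.shape[0], 0, img.shape[1])
    return regions

img = np.random.default_rng(0).random((32, 32))
print(len(segment(img)), "regions")
```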
arXiv:2412.17589 (https://arxiv.org/abs/2412.17589) [cs.AI, cs.LG]
Title: PC Agent: While You Sleep, AI Works -- A Cognitive Journey into Digital World
Authors: Yanheng He, Jiahe Jin, Shijie Xia, Jiadi Su, Runze Fan, Haoyang Zou, Xiangkun Hu, Pengfei Liu
Abstract: Imagine a world where AI can handle your work while you sleep: organizing your research materials, drafting a report, or creating a presentation you need for tomorrow. However, while current digital agents can perform simple tasks, they are far from capable of handling the complex real-world work that humans routinely perform. We present PC Agent, an AI system that demonstrates a crucial step toward this vision through human cognition transfer. Our key insight is that the path from executing simple "tasks" to handling complex "work" lies in efficiently capturing and learning from human cognitive processes during computer use.
To validate this hypothesis, we introduce three key innovations: (1) PC Tracker, a lightweight infrastructure that efficiently collects high-quality human-computer interaction trajectories with complete cognitive context; (2) a two-stage cognition completion pipeline that transforms raw interaction data into rich cognitive trajectories by completing action semantics and thought processes; and (3) a multi-agent system combining a planning agent for decision-making with a grounding agent for robust visual grounding. Our preliminary experiments in PowerPoint presentation creation reveal that complex digital work capabilities can be achieved with a small amount of high-quality cognitive data: PC Agent, trained on just 133 cognitive trajectories, can handle sophisticated work scenarios involving up to 50 steps across multiple applications. This demonstrates the data efficiency of our approach, highlighting that the key to training capable digital agents lies in collecting human cognitive data. By open-sourcing our complete framework, including the data collection infrastructure and cognition completion methods, we aim to lower the barriers for the research community to develop truly capable digital agents.
Submitted: 23 December, 2024; originally announced December 2024.

arXiv:2412.16552 (https://arxiv.org/abs/2412.16552) [cs.CV, cs.AI]
Title: Diffusion Prior Interpolation for Flexibility Real-World Face Super-Resolution
Authors: Jiarui Yang, Tao Dai, Yufei Zhu, Naiqi Li, Jinmin Li, Shutao Xia
Abstract: Diffusion models represent the state of the art in generative modeling. Due to their high training costs, many works leverage pre-trained diffusion models' powerful representations for downstream tasks, such as face super-resolution (FSR), through fine-tuning or prior-based methods.
However, relying solely on priors without supervised training makes it challenging to meet the pixel-level accuracy requirements of a discriminative task. Although prior-based methods can achieve high-fidelity, high-quality results, ensuring consistency remains a significant challenge. In this paper, we propose a masking strategy with strong and weak constraints and iterative refinement for real-world FSR, termed Diffusion Prior Interpolation (DPI). We introduce conditions and constraints on consistency by masking different sampling stages based on the structural characteristics of the face. Furthermore, we propose a condition Corrector (CRT) to establish a reciprocal posterior sampling process, enhancing FSR performance through mutual refinement of conditions and samples. DPI can balance consistency and diversity and can be seamlessly integrated into pre-trained models. In extensive experiments conducted on synthetic and real datasets, along with consistency validation in face recognition, DPI demonstrates superiority over SOTA FSR methods. The code is available at https://github.com/JerryYann/DPI.
Submitted: 21 December, 2024; originally announced December 2024.
Comments: Accepted to AAAI25
arXiv:2412.14518 (https://arxiv.org/abs/2412.14518) [cs.CV, cs.IR, cs.MM]
Title: Efficient Self-Supervised Video Hashing with Selective State Spaces
Authors: Jinpeng Wang, Niu Lian, Jun Li, Yuting Wang, Yan Feng, Bin Chen, Yongbing Zhang, Shu-Tao Xia
Abstract: Self-supervised video hashing (SSVH) is a practical task in video indexing and retrieval. Although Transformers are predominant in SSVH for their impressive temporal modeling capabilities, they often suffer from computational and memory inefficiencies. Drawing inspiration from Mamba, an advanced state-space model, we explore its potential in SSVH to achieve a better balance between efficacy and efficiency. We introduce S5VH, a Mamba-based video hashing model with an improved self-supervised learning paradigm. Specifically, we design bidirectional Mamba layers for both the encoder and decoder, which are effective and efficient in capturing temporal relationships thanks to the data-dependent selective scanning mechanism with linear complexity.
In our learning strategy, we transform global semantics in the feature space into semantically consistent and discriminative hash centers, followed by a center alignment loss as a global learning signal. Our self-local-global (SLG) paradigm significantly improves learning efficiency, leading to faster and better convergence. Extensive experiments demonstrate S5VH's improvements over state-of-the-art methods, superior transferability, and scalable advantages in inference efficiency. Code is available at https://github.com/gimpong/AAAI25-S5VH.
Submitted: 18 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI'25. 9 pages, 5 figures, 2 tables
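Of the pieces above, the center alignment loss is the most self-contained: each video's continuous code is pulled toward the hash center assigned to its semantics. A PyTorch sketch; the random centers, MSE objective, and assignment vector are placeholders for the paper's semantically derived centers and full SLG paradigm:

```python
import torch
import torch.nn.functional as F

def center_alignment_loss(h, centers, assign):
    """Pull each continuous hash code h[i] (in (-1, 1) via tanh) toward its
    matched hash center centers[assign[i]] (entries are +/-1)."""
    return F.mse_loss(h, centers[assign])

bits, batch, n_centers = 64, 8, 10
centers = torch.sign(torch.randn(n_centers, bits))   # hypothetical hash centers
logits = torch.randn(batch, bits, requires_grad=True)
h = torch.tanh(logits)                               # continuous codes pre-binarization
loss = center_alignment_loss(h, centers, torch.randint(0, n_centers, (batch,)))
loss.backward()
```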
arXiv:2412.14479 (https://arxiv.org/abs/2412.14479) [cs.DC]
Title: Frenzy: A Memory-Aware Serverless LLM Training System for Heterogeneous GPU Clusters
Authors: Zihan Chang, Sheng Xiao, Shuibing He, Siling Yang, Zhe Pan, Dong Li
Abstract: Existing work is effective only for a given number of GPUs and neglects the complexities involved in manually determining the specific types and quantities of GPUs needed, which can be a significant burden for developers. To address this issue, we propose Frenzy, a memory-aware serverless computing method for heterogeneous GPU clusters. Frenzy allows users to submit models without worrying about underlying hardware resources. First, Frenzy predicts the required number and type of GPUs by estimating the GPU memory usage of the LLM. Then, it employs a low-overhead heterogeneity-aware scheduling method to optimize training efficiency. We validated Frenzy's performance by conducting multi-task LLM training tests on a heterogeneous GPU cluster with three different GPU types. The results show that Frenzy's memory usage prediction accuracy exceeds 92%, its scheduling overhead is reduced by a factor of 10, and it reduces the average job completion time by 12% to 18% compared with state-of-the-art methods.
Submitted: 18 December, 2024; originally announced December 2024.
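The first step, predicting the required number and type of GPUs from the model's memory footprint, can be approximated with standard rules of thumb. A sketch under the common mixed-precision-Adam estimate of roughly 16 bytes per parameter plus an activation overhead; the factors and the GPU pool are assumptions, not Frenzy's predictor:

```python
import math

def training_mem_gb(n_params_billion, bytes_per_param=16, activation_factor=1.2):
    """Rough LLM training footprint: fp16 weights + grads (4 B/param) and fp32
    master weights + two Adam moments (12 B/param) give ~16 B/param, inflated
    by a fudge factor for activations. Rule of thumb, not a measurement."""
    return n_params_billion * bytes_per_param * activation_factor

GPU_POOL_GB = {"A100-80G": 80, "A100-40G": 40, "V100-32G": 32}  # hypothetical pool

def gpus_needed(n_params_billion):
    need = training_mem_gb(n_params_billion)
    return {gpu: math.ceil(need / cap) for gpu, cap in GPU_POOL_GB.items()}

print(gpus_needed(7))   # ~134 GB -> {'A100-80G': 2, 'A100-40G': 4, 'V100-32G': 5}
```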
arXiv:2412.14475 (https://arxiv.org/abs/2412.14475) [cs.CV, cs.CL]
Title: MegaPairs: Massive Data Synthesis For Universal Multimodal Retrieval
Authors: Junjie Zhou, Zheng Liu, Ze Liu, Shitao Xiao, Yueze Wang, Bo Zhao, Chen Jason Zhang, Defu Lian, Yongping Xiong
Abstract: Despite the rapidly growing demand for multimodal retrieval, progress in this field remains severely constrained by a lack of training data. In this paper, we introduce MegaPairs, a novel data synthesis method that leverages vision-language models (VLMs) and open-domain images, together with a massive synthetic dataset generated from this method. Our empirical analysis shows that MegaPairs generates high-quality data, enabling the multimodal retriever to significantly outperform the baseline model trained on 70x more data from existing datasets. Moreover, since MegaPairs relies solely on general image corpora and open-source VLMs, it can be easily scaled up, enabling continuous improvements in retrieval performance. At this stage, we have produced more than 26 million training instances and trained several models of varying sizes using this data. These new models achieve state-of-the-art zero-shot performance across 4 popular composed image retrieval (CIR) benchmarks and the highest overall performance on the 36 datasets provided by MMEB. They also demonstrate notable performance improvements with additional downstream fine-tuning. Our produced dataset, well-trained models, and data synthesis pipeline will be made publicly available to facilitate the future development of this field.
Submitted: 18 December, 2024; originally announced December 2024.
arXiv:2412.13842 (https://arxiv.org/abs/2412.13842) [cs.LG]
Title: Graph Coarsening via Supervised Granular-Ball for Scalable Graph Neural Network Training
Authors: Shuyin Xia, Xinjun Ma, Zhiyuan Liu, Cheng Liu, Sen Zhao, Guoyin Wang
Abstract: Graph Neural Networks (GNNs) have demonstrated significant achievements in processing graph data, yet scalability remains a substantial challenge. To address this, numerous graph coarsening methods have been developed. However, most existing coarsening methods are training-dependent, leading to lower efficiency, and they all require a predefined coarsening rate, lacking an adaptive approach. In this paper, we employ granular-ball computing to effectively compress graph data. We construct a coarsened graph network by iteratively splitting the graph into granular-balls based on a purity threshold and using these granular-balls as super vertices. This granulation process significantly reduces the size of the original graph, thereby greatly enhancing the training efficiency and scalability of GNNs. Additionally, our algorithm can adaptively perform splitting without requiring a predefined coarsening rate. Experimental results demonstrate that our method achieves accuracy comparable to training on the original graph. Noise injection experiments further indicate that our method exhibits robust performance. Moreover, our approach can reduce the graph size by up to 20 times without compromising test accuracy, substantially enhancing the scalability of GNNs.
Submitted: 18 December, 2024; originally announced December 2024.
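Once nodes are grouped into granular-balls, building the coarsened graph is a projection: each ball becomes a super vertex, and a super-edge weight counts the original edges running between two balls. A numpy sketch with a hand-made assignment; the paper obtains the assignment by purity-driven splitting rather than by fiat:

```python
import numpy as np

def coarsen(adj, assign):
    """Collapse node groups (granular-balls) into super vertices; the super-edge
    weight between two balls is the total edge weight running between them."""
    k = assign.max() + 1
    P = np.zeros((k, adj.shape[0]))
    P[assign, np.arange(adj.shape[0])] = 1.0     # ball-membership matrix
    A_c = P @ adj @ P.T
    np.fill_diagonal(A_c, 0.0)                   # discard intra-ball self-loops
    return A_c

rng = np.random.default_rng(0)
A = np.triu((rng.random((10, 10)) < 0.3).astype(float), 1)
A = A + A.T                                      # toy undirected graph
balls = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2]) # hand-made ball assignment
print(coarsen(A, balls))                         # 3x3 coarsened adjacency
```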
arXiv:2412.13542 (https://arxiv.org/abs/2412.13542) [cs.CL]
Title: Multi-Granularity Open Intent Classification via Adaptive Granular-Ball Decision Boundary
Authors: Yanhua Li, Xiaocao Ouyang, Chaofan Pan, Jie Zhang, Sen Zhao, Shuyin Xia, Xin Yang, Guoyin Wang, Tianrui Li
Abstract: Open intent classification is critical for the development of dialogue systems, aiming to accurately classify known intents into their corresponding classes while identifying unknown intents. Prior boundary-based methods assumed known intents fit within compact spherical regions, focusing on coarse-grained representation and precise spherical decision boundaries. However, these assumptions are often violated in practical scenarios, making it difficult to distinguish known intent classes from unknowns using a single spherical boundary. To tackle these issues, we propose a Multi-granularity Open intent classification method via adaptive Granular-Ball decision boundary (MOGB). Our MOGB method consists of two modules: representation learning and decision boundary acquisition. To effectively represent the intent distribution, we design a hierarchical representation learning method, iteratively alternating between adaptive granular-ball clustering and nearest sub-centroid classification to capture fine-grained semantic structures within known intent classes. Furthermore, multi-granularity decision boundaries are constructed for open intent classification by employing granular-balls with varying centroids and radii. Extensive experiments conducted on three public datasets demonstrate the effectiveness of our proposed method.
Submitted: 18 December, 2024; originally announced December 2024.
Comments: Accepted at AAAI 2025
Journal ref: AAAI 2025
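Stripped to its geometry, a granular-ball decision boundary is a centroid plus a radius per region, with anything outside every ball rejected as an open intent. A single-ball-per-class numpy sketch; MOGB instead learns many balls per class at multiple granularities over learned utterance embeddings:

```python
import numpy as np

def fit_balls(X, y):
    """One ball per known intent: class centroid plus a radius that covers
    the class's training embeddings (the paper uses many balls per class)."""
    balls = {}
    for c in np.unique(y):
        center = X[y == c].mean(axis=0)
        radius = np.linalg.norm(X[y == c] - center, axis=1).max()
        balls[c] = (center, radius)
    return balls

def classify(x, balls):
    """Nearest centroid, with rejection: outside every ball => open intent."""
    dists = {c: np.linalg.norm(x - ctr) for c, (ctr, _) in balls.items()}
    c = min(dists, key=dists.get)
    return c if dists[c] <= balls[c][1] else "open"

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (50, 8)), rng.normal(6, 1, (50, 8))])
y = np.array([0] * 50 + [1] * 50)
balls = fit_balls(X, y)
print(classify(rng.normal(0, 1, 8), balls))   # likely 0 (known intent)
print(classify(rng.normal(20, 1, 8), balls))  # "open" (unknown intent)
```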
Our MOGB method consists of two modules: representation learning and decision boundary acquiring. To effectively represent the intent distribution, we design a hierarchical representation learning method. This involves iteratively alternating between adaptive granular-ball clustering and nearest sub-centroid classification to capture fine-grained semantic structures within known intent classes. Furthermore, multi-granularity decision boundaries are constructed for open intent classification by employing granular-balls with varying centroids and radii. Extensive experiments conducted on three public datasets demonstrate the effectiveness of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13542v1-abstract-full').style.display = 'none'; document.getElementById('2412.13542v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been Accepted on AAAI2025</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> AAAI2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13137">arXiv:2412.13137</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13137">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unlocking the Potential of Digital Pathology: Novel Baselines for Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fischer%2C+M">Maximilian Fischer</a>, <a href="/search/cs?searchtype=author&amp;query=Neher%2C+P">Peter Neher</a>, <a href="/search/cs?searchtype=author&amp;query=Sch%C3%BCffler%2C+P">Peter Sch眉ffler</a>, <a href="/search/cs?searchtype=author&amp;query=Ziegler%2C+S">Sebastian Ziegler</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+S">Shuhan Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Peretzke%2C+R">Robin Peretzke</a>, <a href="/search/cs?searchtype=author&amp;query=Clunie%2C+D">David Clunie</a>, <a href="/search/cs?searchtype=author&amp;query=Ulrich%2C+C">Constantin Ulrich</a>, <a href="/search/cs?searchtype=author&amp;query=Baumgartner%2C+M">Michael Baumgartner</a>, <a href="/search/cs?searchtype=author&amp;query=Muckenhuber%2C+A">Alexander Muckenhuber</a>, <a href="/search/cs?searchtype=author&amp;query=Almeida%2C+S+D">Silvia Dias Almeida</a>, <a href="/search/cs?searchtype=author&amp;query=G%C3%B6tz%2C+M">Michael G枚tz</a>, <a href="/search/cs?searchtype=author&amp;query=Kleesiek%2C+J">Jens Kleesiek</a>, <a href="/search/cs?searchtype=author&amp;query=Nolden%2C+M">Marco Nolden</a>, <a href="/search/cs?searchtype=author&amp;query=Braren%2C+R">Rickmer Braren</a>, <a href="/search/cs?searchtype=author&amp;query=Maier-Hein%2C+K">Klaus Maier-Hein</a> </p> <p class="abstract mathjax"> <span 
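<p>At decision time, the multi-granularity boundary idea above reduces to checking whether an embedding falls inside some known-class ball (centroid plus radius). A hedged sketch follows, with plain per-class k-means standing in for adaptive granular-ball clustering; all names and parameters are illustrative, not MOGB's implementation.</p> <pre><code>
# Sketch: fit (centroid, radius, class) balls per known class, then call an
# input "open intent" when it lies outside every ball.
import numpy as np

def fit_balls(X, y, balls_per_class=2, iters=20, seed=0):
    rng = np.random.default_rng(seed)
    balls = []
    for c in np.unique(y):
        Z = X[y == c]
        centroids = Z[rng.choice(len(Z), balls_per_class, replace=False)]
        for _ in range(iters):                  # plain k-means inside one class
            a = np.linalg.norm(Z[:, None] - centroids[None], axis=2).argmin(1)
            centroids = np.stack([Z[a == k].mean(0) if (a == k).any() else centroids[k]
                                  for k in range(balls_per_class)])
        for k in range(balls_per_class):
            if (a == k).any():
                radius = np.linalg.norm(Z[a == k] - centroids[k], axis=1).max()
                balls.append((centroids[k], radius, int(c)))
    return balls

def classify(x, balls, unknown=-1):
    signed = [np.linalg.norm(x - c) - r for c, r, _ in balls]
    i = int(np.argmin(signed))
    return balls[i][2] if signed[i] <= 0 else unknown   # inside a ball: known

rng = np.random.default_rng(1)
X = np.vstack([rng.normal(0, 1, (50, 4)), rng.normal(6, 1, (50, 4))])
y = np.array([0] * 50 + [1] * 50)
balls = fit_balls(X, y)
print(classify(X[0], balls), classify(np.full(4, 30.0), balls))  # known, -1
</code></pre>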
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13137v1-abstract-short" style="display: inline;"> Digital pathology offers a groundbreaking opportunity to transform clinical practice in histopathological image analysis, yet faces a significant hurdle: the substantial file sizes of pathological Whole Slide Images (WSI). While current digital pathology solutions rely on lossy JPEG compression to address this issue, lossy compression can introduce color and texture disparities, potentially impact&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13137v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13137v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13137v1-abstract-full" style="display: none;"> Digital pathology offers a groundbreaking opportunity to transform clinical practice in histopathological image analysis, yet faces a significant hurdle: the substantial file sizes of pathological Whole Slide Images (WSI). While current digital pathology solutions rely on lossy JPEG compression to address this issue, lossy compression can introduce color and texture disparities, potentially impacting clinical decision-making. While prior research addresses perceptual image quality and downstream performance independently of each other, we jointly evaluate compression schemes for perceptual and downstream task quality on four different datasets. In addition, we collect an initially uncompressed dataset for an unbiased perceptual evaluation of compression schemes. Our results show that deep learning models fine-tuned for perceptual quality outperform conventional compression schemes like JPEG-XL or WebP for further compression of WSI. However, they exhibit a significant bias towards the compression artifacts present in the training data and struggle to generalize across various compression schemes. We introduce a novel evaluation metric based on feature similarity between original files and compressed files that aligns very well with the actual downstream performance on the compressed WSI. Our metric allows for a general and standardized evaluation of lossy compression schemes and mitigates the requirement to independently assess different downstream tasks. Our study provides novel insights for the assessment of lossy compression schemes for WSI and encourages a unified evaluation of lossy compression schemes to accelerate the clinical uptake of digital pathology. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13137v1-abstract-full').style.display = 'none'; document.getElementById('2412.13137v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13102">arXiv:2412.13102</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13102">pdf</a>, <a href="https://arxiv.org/format/2412.13102">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> AIR-Bench: Automated Heterogeneous Information Retrieval Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jianlyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+N">Nan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chaofan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+S">Shitao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+H">Han Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+H">Hao Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Lian%2C+D">Defu Lian</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zheng Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Evaluation plays a crucial role in the advancement of information retrieval (IR) models. However, current benchmarks, which are based on predefined domains and human-labeled data, face limitations in addressing evaluation needs for emerging domains both cost-effectively and efficiently. To address this challenge, we propose the Automated Heterogeneous Information Retrieval Benchmark (AIR-Bench). AIR-Bench is distinguished by three key features: 1) Automated. The testing data in AIR-Bench is automatically generated by large language models (LLMs) without human intervention. 2) Heterogeneous. The testing data in AIR-Bench is generated with respect to diverse tasks, domains and languages. 3) Dynamic. The domains and languages covered by AIR-Bench are constantly augmented to provide an increasingly comprehensive evaluation benchmark for community developers. We develop a reliable and robust data generation pipeline to automatically create diverse and high-quality evaluation datasets based on real-world corpora. Our findings demonstrate that the generated testing data in AIR-Bench aligns well with human-labeled testing data, making AIR-Bench a dependable benchmark for evaluating IR models. The resources in AIR-Bench are publicly available at https://github.com/AIR-Bench/AIR-Bench. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">31 pages, 6 figures; updated Table 4 and Figure 3</span> </p> </li>
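<p>At a high level, the pipeline above turns corpus documents into queries and then filters them for quality. A toy sketch under that reading; <code>generate_query</code> and <code>quality_ok</code> are hypothetical placeholders for LLM calls, not AIR-Bench's actual pipeline.</p> <pre><code>
# Sketch: generate one query per corpus document, keep only cases that pass a
# quality gate; the two helpers stand in for LLM generation and judging.
from dataclasses import dataclass

@dataclass
class TestCase:
    query: str
    positive_doc: str

def generate_query(doc):
    # Placeholder for an LLM call that writes a query answerable by `doc`.
    return "What does this document say about " + doc.split()[0] + "?"

def quality_ok(case):
    # Placeholder quality control, e.g. an LLM judge or heuristic filters.
    return len(case.query) > 10

corpus = ["granular balls coarsen graphs efficiently",
          "KV caches store attention state during decoding"]
testset = [c for c in (TestCase(generate_query(d), d) for d in corpus) if quality_ok(c)]
print(len(testset), "generated test cases")
</code></pre>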
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10461">arXiv:2412.10461</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.10461">pdf</a>, <a href="https://arxiv.org/format/2412.10461">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> EvoSampling: A Granular Ball-based Evolutionary Hybrid Sampling with Knowledge Transfer for Imbalanced Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pei%2C+W">Wenbin Pei</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+R">Ruohao Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+B">Bing Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Mengjie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheung%2C+Y">Yiu-Ming Cheung</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shuyin Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Class imbalance leads to biased classifiers that favor the majority class and disadvantage the minority class. Unfortunately, from a practical perspective, the minority class is of importance in many real-life applications. Hybrid sampling methods address this by oversampling the minority class to increase the number of its instances, followed by undersampling to remove low-quality instances. However, most existing sampling methods face difficulties in generating diverse high-quality instances and often fail to remove noise or low-quality instances on a larger scale effectively. This paper therefore proposes an evolutionary multi-granularity hybrid sampling method, called EvoSampling. During the oversampling process, genetic programming (GP) is used with multi-task learning to effectively and efficiently generate diverse high-quality instances. During the undersampling process, we develop a granular ball-based undersampling method that removes noise in a multi-granular fashion, thereby enhancing data quality. Experiments on 20 imbalanced datasets demonstrate that EvoSampling effectively enhances the performance of various classification algorithms by providing better datasets than existing sampling methods. Moreover, ablation studies further indicate that allowing knowledge transfer accelerates the GP&#39;s evolutionary learning process. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li>
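<p>EvoSampling's two phases can be skeletonized as oversample-then-undersample. In the hedged sketch below, convex interpolation stands in for the GP generator and a purity filter stands in for the granular-ball undersampler; every detail is an illustrative assumption.</p> <pre><code>
# Sketch: balance a dataset by synthesizing minority instances, then dropping
# majority-class points that sit inside impure "balls" (treated as noise).
import numpy as np

def oversample_minority(X_min, n_new, rng):
    # Stand-in for GP-generated instances: convex combinations of minority pairs.
    i, j = rng.integers(0, len(X_min), (2, n_new))
    lam = rng.random((n_new, 1))
    return lam * X_min[i] + (1 - lam) * X_min[j]

def undersample_by_purity(X, y, threshold=0.8, n_balls=10, seed=0):
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), n_balls, replace=False)]
    a = np.linalg.norm(X[:, None] - centers[None], axis=2).argmin(1)
    keep = np.zeros(len(X), dtype=bool)
    for k in range(n_balls):
        m = a == k
        if not m.any():
            continue
        labels = y[m]
        maj = np.bincount(labels).argmax()
        # Keep pure balls whole; in impure balls drop the majority-label points.
        keep[m] = True if (labels == maj).mean() >= threshold else (labels != maj)
    return X[keep], y[keep]

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 5)); y = (rng.random(300) < 0.1).astype(int)
X_new = oversample_minority(X[y == 1], (y == 0).sum() - (y == 1).sum(), rng)
Xb, yb = np.vstack([X, X_new]), np.concatenate([y, np.ones(len(X_new), int)])
print(Xb.shape, "->", undersample_by_purity(Xb, yb)[0].shape)
</code></pre>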
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10235">arXiv:2412.10235</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.10235">pdf</a>, <a href="https://arxiv.org/format/2412.10235">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EnvPoser: Environment-aware Realistic Human Motion Estimation from Sparse Observations with Uncertainty Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Songpengcheng Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+Z">Zhuo Su</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+X">Xiaozheng Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+Z">Zheng Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guidong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yongjie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+L">Lei Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Pei%2C+L">Ling Pei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Estimating full-body motion using the tracking signals of head and hands from VR devices holds great potential for various applications. However, the sparsity and unique distribution of observations present a significant challenge, resulting in an ill-posed problem with multiple feasible solutions (i.e., hypotheses). This amplifies uncertainty and ambiguity in full-body motion estimation, especially for the lower-body joints. Therefore, we propose a new method, EnvPoser, that employs a two-stage framework to perform full-body motion estimation using sparse tracking signals and a pre-scanned environment from VR devices. EnvPoser models the multi-hypothesis nature of human motion through an uncertainty-aware estimation module in the first stage. In the second stage, we refine these multi-hypothesis estimates by integrating semantic and geometric environmental constraints, ensuring that the final motion estimation aligns realistically with both the environmental context and physical interactions. Qualitative and quantitative experiments on two public datasets demonstrate that our method achieves state-of-the-art performance, highlighting significant improvements in human motion estimation within motion-environment interaction scenarios. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09945">arXiv:2412.09945</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.09945">pdf</a>, <a href="https://arxiv.org/format/2412.09945">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Going Beyond Feature Similarity: Effective Dataset distillation based on Class-aware Conditional Mutual Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+X">Xinhao Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+H">Hao Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+X">Xulin Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shu-Tao Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+E">En-Hui Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Dataset distillation (DD) aims to minimize the time and memory consumption needed for training deep neural networks on large datasets, by creating a smaller synthetic dataset that has similar performance to that of the full real dataset. However, current dataset distillation methods often result in synthetic datasets that are excessively difficult for networks to learn from, due to the compression of a substantial amount of information from the original data through metrics measuring feature similarity, e.g., distribution matching (DM). In this work, we introduce conditional mutual information (CMI) to assess the class-aware complexity of a dataset and propose a novel method by minimizing CMI. Specifically, we simultaneously minimize the distillation loss and constrain the class-aware complexity of the synthetic dataset by minimizing its empirical CMI in the feature space of pre-trained networks. Through a thorough set of experiments, we show that our method can serve as a general regularization for existing DD methods, improving both performance and training efficiency. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li>
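<p>One common way to estimate a class-aware CMI of the kind described above is the mean KL divergence between each sample's softmax output and its class-conditional average output, added to the distillation loss as a regularizer. The sketch below follows that reading; the logits, loss value, and weighting are stand-ins, not the paper's exact estimator.</p> <pre><code>
# Sketch: empirical CMI as E_y E_{x|y} KL( p(.|x) || mean_{x'|y} p(.|x') ),
# used as an additive penalty on top of a distillation objective.
import numpy as np

def softmax(z):
    z = z - z.max(-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(-1, keepdims=True)

def empirical_cmi(logits, y):
    p = softmax(logits)                       # per-sample class posteriors
    cmi = 0.0
    for c in np.unique(y):
        pc = p[y == c]
        pbar = pc.mean(0, keepdims=True)      # class-conditional average output
        kl = (pc * (np.log(pc + 1e-12) - np.log(pbar + 1e-12))).sum(1).mean()
        cmi += kl * (y == c).mean()           # weight by class frequency
    return cmi

def total_loss(dd_loss, logits, y, lam=0.1):
    return dd_loss + lam * empirical_cmi(logits, y)

rng = np.random.default_rng(0)
print(total_loss(0.42, rng.normal(size=(64, 10)), rng.integers(0, 10, 64)))
</code></pre>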
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09001">arXiv:2412.09001</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.09001">pdf</a>, <a href="https://arxiv.org/format/2412.09001">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> MindScratch: A Visual Programming Support Tool for Classroom Learning Based on Multimodal Generative AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yunnong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+S">Shuhong Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yaxuan Song</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zejian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Lingyun Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Liuqing Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Programming has become an essential component of K-12 education and serves as a pathway for developing computational thinking skills. Given the complexity of programming and the advanced skills it requires, previous research has introduced user-friendly tools to support young learners. However, our interviews with six programming educators revealed that current tools often fail to reflect classroom learning objectives, to offer flexible, high-quality guidance, or to foster student creativity. This highlights the need for more adaptive and reflective tools. Therefore, we introduce MindScratch, a multimodal generative AI (GAI) powered visual programming support tool. MindScratch aims to balance structured classroom activities with free programming creation, supporting students in completing creative programming projects based on teacher-set learning objectives while also providing programming scaffolding. Our user study results indicate that, compared to the baseline, MindScratch more effectively helps students achieve high-quality projects aligned with learning objectives. It also enhances students&#39; computational thinking skills and creative thinking. Overall, we believe that GAI-driven educational tools like MindScratch offer students a focused and engaging learning experience. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 7 figures</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08521">arXiv:2412.08521</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.08521">pdf</a>, <a href="https://arxiv.org/format/2412.08521">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> EMS: Adaptive Evict-then-Merge Strategy for Head-wise KV Cache Compression Based on Global-Local Importance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yingxin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Ye Li</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+Y">Yuan Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xinzhu Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Geng%2C+Z">Zihan Geng</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shutao Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: As large language models (LLMs) continue to advance, the demand for higher quality and faster processing of long contexts across various applications is growing. KV cache is widely adopted as it stores previously generated key and value tokens, effectively reducing redundant computations during inference. However, as memory overhead becomes a significant concern, efficient compression of KV cache has gained increasing attention. Most existing methods perform compression from two perspectives: identifying important tokens and designing compression strategies. However, these approaches often produce biased distributions of important tokens due to the influence of accumulated attention scores or positional encoding. Furthermore, they overlook the sparsity and redundancy across different heads, which leads to difficulties in preserving the most effective information at the head level. To this end, we propose EMS to overcome these limitations, while achieving better KV cache compression under extreme compression ratios. Specifically, we introduce a Global-Local score that combines accumulated attention scores from both global and local KV tokens to better identify the token importance. For the compression strategy, we design an adaptive and unified Evict-then-Merge framework that accounts for the sparsity and redundancy of KV tokens across different heads. Additionally, we implement the head-wise parallel compression through a zero-class mechanism to enhance efficiency. Extensive experiments demonstrate our SOTA performance even under extreme compression ratios. EMS consistently achieves the lowest perplexity, improves scores by over 1.28 points across four LLMs on LongBench under a 256 cache budget, and preserves 95% retrieval accuracy with a cache budget less than 2% of the context length in the Needle-in-a-Haystack task. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li>
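<p>The Global-Local scoring and the evict-then-merge step can be sketched on a single head's cache. The window size, the 50/50 score mix, and the weighted-average merge below are illustrative assumptions, not EMS's exact design.</p> <pre><code>
# Sketch: score each cached KV slot by mixing global (all queries) and local
# (recent queries) accumulated attention; keep the top-budget slots and merge
# the evicted ones into a single weighted-average slot.
import numpy as np

def global_local_score(attn, window=32):
    g = attn.mean(0)                       # global view: all queries
    l = attn[-window:].mean(0)             # local view: recent queries only
    return 0.5 * g + 0.5 * l

def evict_then_merge(keys, values, score, budget):
    order = np.argsort(score)[::-1]
    kept, evicted = order[:budget], order[budget:]
    if len(evicted):
        w = score[evicted] / (score[evicted].sum() + 1e-12)
        merged_k = (w[:, None] * keys[evicted]).sum(0)
        merged_v = (w[:, None] * values[evicted]).sum(0)
        keys = np.vstack([keys[kept], merged_k[None]])
        values = np.vstack([values[kept], merged_v[None]])
    return keys, values

rng = np.random.default_rng(0)
K, V = rng.normal(size=(128, 64)), rng.normal(size=(128, 64))
attn = rng.random((256, 128))              # (queries, cached keys)
K2, V2 = evict_then_merge(K, V, global_local_score(attn), budget=32)
print(K.shape, "->", K2.shape)             # 128 slots -> 33 (32 kept + 1 merged)
</code></pre>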
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07277">arXiv:2412.07277</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.07277">pdf</a>, <a href="https://arxiv.org/format/2412.07277">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Backdoor Attacks against No-Reference Image Quality Assessment Models via a Scalable Trigger </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Song Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+X">Xun Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+S">Shijian Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+Y">Yap-peng Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Kot%2C+A">Alex Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: No-Reference Image Quality Assessment (NR-IQA), responsible for assessing the quality of a single input image without using any reference, plays a critical role in evaluating and optimizing computer vision systems, e.g., low-light enhancement. Recent research indicates that NR-IQA models are susceptible to adversarial attacks, which can significantly alter predicted scores with visually imperceptible perturbations. Despite revealing vulnerabilities, these attack methods have limitations, including high computational demands, untargeted manipulation, limited practical utility in white-box scenarios, and reduced effectiveness in black-box scenarios. To address these challenges, we shift our focus to another significant threat and present a novel poisoning-based backdoor attack against NR-IQA (BAIQA), allowing the attacker to manipulate the IQA model&#39;s output to any desired target value by simply adjusting a scaling coefficient $\alpha$ for the trigger. We propose to inject the trigger in the discrete cosine transform (DCT) domain to improve the local invariance of the trigger for countering trigger diminishment in NR-IQA models due to widely adopted data augmentations. Furthermore, the universal adversarial perturbations (UAP) in the DCT space are designed as the trigger, to increase IQA model susceptibility to manipulation and improve attack effectiveness. In addition to the heuristic method for poison-label BAIQA (P-BAIQA), we explore the design of clean-label BAIQA (C-BAIQA), focusing on $\alpha$ sampling and image data refinement, driven by theoretical insights we reveal. Extensive experiments on diverse datasets and various NR-IQA models demonstrate the effectiveness of our attacks. Code can be found at https://github.com/yuyi-sd/BAIQA. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li>
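<p>The DCT-domain trigger injection with a scaling coefficient $\alpha$ can be sketched directly. In this minimal illustration, random noise stands in for the learned UAP trigger; the clipping and coefficient handling are assumptions, not the paper's exact procedure.</p> <pre><code>
# Sketch: move the image into the DCT domain, add an alpha-scaled trigger
# pattern to the coefficients, and transform back.
import numpy as np
from scipy.fft import dctn, idctn

def poison(image, trigger, alpha):
    coeffs = dctn(image, norm="ortho")          # image -> DCT coefficients
    coeffs += alpha * trigger                   # alpha controls the target value
    return np.clip(idctn(coeffs, norm="ortho"), 0.0, 1.0)

rng = np.random.default_rng(0)
img = rng.random((64, 64))
trig = rng.normal(scale=0.01, size=img.shape)   # stand-in for the UAP trigger
poisoned = poison(img, trig, alpha=2.0)
print(float(np.abs(poisoned - img).max()))      # perturbation stays small
</code></pre>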
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05695">arXiv:2412.05695</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.05695">pdf</a>, <a href="https://arxiv.org/format/2412.05695">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> WATER-GS: Toward Copyright Protection for 3D Gaussian Splatting via Universal Watermarking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+Y">Yuqi Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+S">Shuzhao Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shu-Tao Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: 3D Gaussian Splatting (3DGS) has emerged as a pivotal technique for 3D scene representation, providing rapid rendering speeds and high fidelity. As 3DGS gains prominence, safeguarding its intellectual property becomes increasingly crucial since 3DGS could be used to imitate unauthorized scene creations and raise copyright issues. Existing watermarking methods for implicit NeRFs cannot be directly applied to 3DGS due to its explicit representation and real-time rendering process, leaving watermarking for 3DGS largely unexplored. In response, we propose WATER-GS, a novel method designed to protect 3DGS copyrights through a universal watermarking strategy. First, we introduce a pre-trained watermark decoder, treating raw 3DGS generative modules as potential watermark encoders to ensure imperceptibility. Additionally, we implement novel 3D distortion layers to enhance the robustness of the embedded watermark against common real-world distortions of point cloud data. Comprehensive experiments and ablation studies demonstrate that WATER-GS effectively embeds imperceptible and robust watermarks into 3DGS without compromising rendering efficiency and quality. Our experiments indicate that the 3D distortion layers can yield up to a 20% improvement in accuracy rate. Notably, our method is adaptable to different 3DGS variants, including 3DGS compression frameworks and 2D Gaussian splatting. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05555">arXiv:2412.05555</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.05555">pdf</a>, <a href="https://arxiv.org/format/2412.05555">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fragmented Layer Grouping in GUI Designs Through Graph Learning Based on Multimodal Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yunnong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+S">Shuhong Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiazhi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+T">Tingting Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+Y">Yanfang Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhen%2C+Y">Yankun Zhen</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Lingyun Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Liuqing Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Automatically constructing GUI groups of different granularities constitutes a critical intelligent step towards automating GUI design and implementation tasks. Specifically, in the industrial GUI-to-code process, fragmented layers may decrease the readability and maintainability of generated code, which can be alleviated by grouping semantically consistent fragmented layers in the design prototypes. This study proposes a graph-learning-based approach to tackle the fragmented layer grouping problem according to multi-modal information in design prototypes. Our graph learning module consists of self-attention and graph neural network modules. By taking the multimodal fused representation of GUI layers as input, we group fragmented layers by simultaneously classifying GUI layers and regressing the bounding boxes of the corresponding GUI components. Experiments on two real-world datasets demonstrate that our model achieves state-of-the-art performance. A further user study is also conducted to validate that our approach can assist an intelligent downstream tool in generating more maintainable and readable front-end code. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages, 6 figures</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05167">arXiv:2412.05167</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.05167">pdf</a>, <a href="https://arxiv.org/format/2412.05167">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Open-ended Audio Dialogue Understanding for Large Audio-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+K">Kuofeng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shu-Tao Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+K">Ke Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jindong Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Large Audio-Language Models (LALMs) have unlocked audio dialogue capabilities, where audio dialogues are a direct exchange of spoken language between LALMs and humans. Recent advances, such as GPT-4o, have enabled LALMs in back-and-forth audio dialogues with humans. This progression not only underscores the potential of LALMs but also broadens their applicability across a wide range of practical scenarios supported by audio dialogues. However, given these advancements, a comprehensive benchmark to evaluate the performance of LALMs in open-ended audio dialogue understanding is still absent. To address this gap, we propose an Audio Dialogue Understanding Benchmark (ADU-Bench), which consists of 4 benchmark datasets. They assess the open-ended audio dialogue ability for LALMs in 3 general scenarios, 12 skills, 9 multilingual languages, and 4 categories of ambiguity handling. Notably, we are the first to propose evaluating ambiguity handling in audio dialogues, where the same literal sentence can express different intentions, e.g., &#34;Really!?&#34; with different intonations. In summary, ADU-Bench includes over 20,000 open-ended audio dialogues for the assessment of LALMs. Through extensive experiments conducted on 13 LALMs, our analysis reveals that there is still considerable room for improvement in the audio dialogue understanding abilities of existing LALMs. In particular, they struggle with mathematical symbols and formulas, understanding human behavior such as roleplay, comprehending multiple languages, and handling audio dialogue ambiguities from different phonetic elements, such as intonations, pause positions, and homophones. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04842">arXiv:2412.04842</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.04842">pdf</a>, <a href="https://arxiv.org/format/2412.04842">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UniMLVG: Unified Framework for Multi-view Long Video Generation with Comprehensive Control Capabilities for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+R">Rui Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zehuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yichen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yuxin Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+J">Jingcheng Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+H">Haifeng Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Siyu Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: The creation of diverse and realistic driving scenarios has become essential to enhance the perception and planning capabilities of autonomous driving systems. However, generating long-duration, surround-view consistent driving videos remains a significant challenge. To address this, we present UniMLVG, a unified framework designed to generate extended street multi-perspective videos under precise control. By integrating single- and multi-view driving videos into the training data, our approach updates cross-frame and cross-view modules across three stages with different training objectives, substantially boosting the diversity and quality of generated visual content. Additionally, we employ explicit viewpoint modeling in multi-view video generation to effectively improve motion transition consistency. Capable of handling various input reference formats (e.g., text, images, or video), our UniMLVG generates high-quality multi-view videos according to the corresponding condition constraints such as 3D bounding boxes or frame-level text descriptions. Compared to the best models with similar capabilities, our framework achieves improvements of 21.4% in FID and 36.5% in FVD. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.02081">arXiv:2412.02081</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.02081">pdf</a>, <a href="https://arxiv.org/format/2412.02081">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Let&#39;s Think Var-by-Var: Large Language Models Enable Ad Hoc Probabilistic Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shepard Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+B">Brian Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Eisner%2C+J">Jason Eisner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: A hallmark of intelligence is the ability to flesh out underspecified situations using &#34;common sense.&#34; We propose to extract that common sense from large language models (LLMs), in a form that can feed into probabilistic inference. We focus our investigation on $\textit{guesstimation}$ questions such as &#34;How much are Airbnb listings in Newark, NJ?&#34; Formulating a sensible answer without access to data requires drawing on, and integrating, bits of common knowledge about how $\texttt{Price}$ and $\texttt{Location}$ may relate to other variables, such as $\texttt{Property Type}$. Our framework answers such a question by synthesizing an $\textit{ad hoc}$ probabilistic model. First we prompt an LLM to propose a set of random variables relevant to the question, followed by moment constraints on their joint distribution. We then optimize the joint distribution $p$ within a log-linear family to maximize the overall constraint satisfaction. Our experiments show that LLMs can successfully be prompted to propose reasonable variables, and while the proposed numerical constraints can be noisy, jointly optimizing for their satisfaction reconciles them. When evaluated on probabilistic questions derived from three real-world tabular datasets, we find that our framework performs comparably to a direct prompting baseline in terms of total variation distance from the dataset distribution, and is similarly robust to noise. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li>
There are three important features of this framework: (1) The domains under consideration are not required to be diffeomorphic; a wide range of regions can therefore be covered by one model, provided they are homeomorphic. (2) The deformation mapping need not be continuous, so it can be flexibly constructed by combining a primary identity mapping with a local deformation mapping. This capability facilitates the resolution of large systems where only local parts of the geometry undergo change. (3) If a linearity-preserving neural operator such as MIONet is adopted, the framework preserves the linearity of the surrogate solution mapping with respect to its source term for linear PDEs, so it can be applied to the hybrid iterative method. We finally present several numerical experiments to validate our theoretical results.
Submitted 2 December, 2024; originally announced December 2024.
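
The deformation idea can be pictured with a toy example: functions on a family of star-shaped domains are pulled back to a fixed reference domain, and a network learns the solution there from the deformation parameters. A PyTorch sketch under those assumptions; the network and the deformation family are illustrative, not the paper's D2D/D2E constructions:

```python
# Toy sketch of the deformation idea: pull each varying domain back to a
# fixed reference domain and learn the solution there. The architecture
# and deformation family are illustrative assumptions only.
import torch
import torch.nn as nn

class ReferenceDomainSolver(nn.Module):
    """Maps (deformation parameters, reference coordinates) to solution values."""
    def __init__(self, n_params: int, hidden: int = 64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_params + 2, hidden), nn.Tanh(),
            nn.Linear(hidden, hidden), nn.Tanh(),
            nn.Linear(hidden, 1),
        )

    def forward(self, params, x_ref):
        # params: (batch, n_params); x_ref: (batch, n_points, 2)
        p = params.unsqueeze(1).expand(-1, x_ref.shape[1], -1)
        return self.net(torch.cat([p, x_ref], dim=-1)).squeeze(-1)

def deform(x_ref, params):
    # Example family of star-shaped deformations: scale each reference
    # point radially by 1 + a*cos(3*theta), parameterized by a.
    theta = torch.atan2(x_ref[..., 1:2], x_ref[..., 0:1])
    scale = 1.0 + params[:, None, 0:1] * torch.cos(3.0 * theta)
    return x_ref * scale

model = ReferenceDomainSolver(n_params=1)
params = torch.rand(8, 1) * 0.2         # one deformation per sample domain
x_ref = torch.rand(8, 128, 2) * 2 - 1   # reference-domain query points
u_hat = model(params, x_ref)            # predicted solution, shape (8, 128)
x_phys = deform(x_ref, params)          # where those values live physically
```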

arXiv:2412.00579 [physics.flu-dyn, cs.LG, math.NA, physics.comp-ph]
Operator learning regularization for macroscopic permeability prediction in dual-scale flow problem
Authors: Christina Runkel, Sinan Xiao, Nicolas Boullé, Yang Chen
Abstract: Liquid composite moulding is an important manufacturing technology for fibre-reinforced composites, due to its cost-effectiveness. Challenges lie in the optimisation of the process due to the lack of understanding of a key characteristic of textile fabrics: permeability. The problem of computing the permeability coefficient can be modelled by the well-known Stokes-Brinkman equation, which introduces a heterogeneous parameter $\beta$ distinguishing macropore regions and fibre-bundle regions. In the present work, we train a Fourier neural operator to learn the nonlinear map from the heterogeneous coefficient $\beta$ to the velocity field $u$, and recover the corresponding macroscopic permeability $K$. This is a challenging inverse problem, since both the input and output fields span several orders of magnitude; we introduce different regularization techniques for the loss function and perform a quantitative comparison between them.
Submitted 30 November, 2024; originally announced December 2024.
Comments: 23 pages, 7 figures
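
One generic way to regularize a loss when fields span several orders of magnitude is to compare predictions in log space rather than raw values. A small sketch of that contrast; these loss choices are illustrative and not necessarily the ones compared in the paper:

```python
# Sketch of loss regularization when fields span orders of magnitude:
# plain MSE versus a log-space loss. Generic choices for illustration,
# not necessarily those evaluated in the paper.
import torch

def mse_loss(u_pred, u_true):
    return torch.mean((u_pred - u_true) ** 2)

def log_mse_loss(u_pred, u_true, eps=1e-12):
    # Compress the dynamic range before comparing, so small-magnitude
    # regions are not drowned out by large-magnitude ones.
    return torch.mean((torch.log(u_pred.abs() + eps)
                       - torch.log(u_true.abs() + eps)) ** 2)

u_true = torch.tensor([1e-6, 1e-3, 1.0, 1e3])
u_pred = u_true * 1.1                 # uniform 10% relative error
print(mse_loss(u_pred, u_true))       # dominated by the 1e3 entry
print(log_mse_loss(u_pred, u_true))   # treats all scales alike
```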
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00440">arXiv:2412.00440</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00440">pdf</a>, <a href="https://arxiv.org/format/2412.00440">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Advancing Myopia To Holism: Fully Contrastive Language-Image Pre-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haicheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ju%2C+C">Chen Ju</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+W">Weixiong Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+S">Shuai Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Mengting Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yixuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+M">Mingshuai Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+J">Jinsong Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Ying Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qingwen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanfeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00440v1-abstract-short" style="display: inline;"> In rapidly evolving field of vision-language models (VLMs), contrastive language-image pre-training (CLIP) has made significant strides, becoming foundation for various downstream tasks. However, relying on one-to-one (image, text) contrastive paradigm to learn alignment from large-scale messy web data, CLIP faces a serious myopic dilemma, resulting in biases towards monotonous short texts and sha&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00440v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00440v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00440v1-abstract-full" style="display: none;"> In rapidly evolving field of vision-language models (VLMs), contrastive language-image pre-training (CLIP) has made significant strides, becoming foundation for various downstream tasks. However, relying on one-to-one (image, text) contrastive paradigm to learn alignment from large-scale messy web data, CLIP faces a serious myopic dilemma, resulting in biases towards monotonous short texts and shallow visual expressivity. To overcome these issues, this paper advances CLIP into one novel holistic paradigm, by updating both diverse data and alignment optimization. To obtain colorful data with low cost, we use image-to-text captioning to generate multi-texts for each image, from multiple perspectives, granularities, and hierarchies. Two gadgets are proposed to encourage textual diversity. 
To match such (image, multi-texts) pairs, we modify the CLIP image encoder into a multi-branch architecture, and propose a multi-to-multi contrastive optimization for image-text part-to-part matching. As a result, diverse visual embeddings are learned for each image, bringing good interpretability and generalization. Extensive experiments and ablations across more than ten benchmarks indicate that our holistic CLIP significantly outperforms existing myopic CLIP on tasks including image-text retrieval, open-vocabulary classification, and dense visual tasks.
Submitted 30 November, 2024; originally announced December 2024.
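
A multi-to-multi contrastive objective of the flavor sketched in this abstract can be written as an InfoNCE loss over (image-branch, caption) pairs. A simplified PyTorch stand-in, assuming M branch embeddings per image aligned with M generated captions; this is not the paper's exact part-to-part formulation:

```python
# Sketch of a multi-to-multi contrastive loss: each image carries M
# branch embeddings matched against M generated captions. A simplified
# stand-in for the paper's part-to-part objective.
import torch
import torch.nn.functional as F

def multi_to_multi_infonce(img, txt, temperature=0.07):
    # img, txt: (B, M, D) tensors, M embeddings per image / caption set.
    B, M, D = img.shape
    img = F.normalize(img.reshape(B * M, D), dim=-1)
    txt = F.normalize(txt.reshape(B * M, D), dim=-1)
    logits = img @ txt.t() / temperature   # (B*M, B*M) similarities
    labels = torch.arange(B * M)           # aligned (image-part, text) pairs
    return (F.cross_entropy(logits, labels)
            + F.cross_entropy(logits.t(), labels)) / 2

loss = multi_to_multi_infonce(torch.randn(4, 3, 512), torch.randn(4, 3, 512))
```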

arXiv:2411.19714 [cs.NI, cs.CV, cs.DC, cs.LG]
The Streetscape Application Services Stack (SASS): Towards a Distributed Sensing Architecture for Urban Applications
Authors: Navid Salami Pargoo, Mahshid Ghasemi, Shuren Xia, Mehmet Kerem Turkcan, Taqiya Ehsan, Chengbo Zang, Yuan Sun, Javad Ghaderi, Gil Zussman, Zoran Kostic, Jorge Ortiz
Abstract: As urban populations grow, cities are becoming more complex, driving the deployment of interconnected sensing systems to realize the vision of smart cities. These systems aim to improve safety, mobility, and quality of life through applications that integrate diverse sensors with real-time decision-making. Streetscape applications, which focus on challenges like pedestrian safety and adaptive traffic management, depend on managing distributed, heterogeneous sensor data, aligning information across time and space, and enabling real-time processing. These tasks are inherently complex and often difficult to scale. The Streetscape Application Services Stack (SASS) addresses these challenges with three core services: multimodal data synchronization, spatiotemporal data fusion, and distributed edge computing. By structuring these capabilities as composable abstractions with clear semantics, SASS allows developers to scale streetscape applications efficiently while minimizing the complexity of multimodal integration. We evaluated SASS in two real-world testbed environments: a controlled parking lot and an urban intersection in a major U.S. city. These testbeds allowed us to test SASS under diverse conditions, demonstrating its practical applicability. The Multimodal Data Synchronization service reduced temporal misalignment errors by 88%, achieving synchronization accuracy within 50 milliseconds. The Spatiotemporal Data Fusion service improved detection accuracy for pedestrians and vehicles by over 10%, leveraging multi-camera integration. The Distributed Edge Computing service increased system throughput by more than an order of magnitude. Together, these results show how SASS provides the abstractions and performance needed to support real-time, scalable urban applications, bridging the gap between sensing infrastructure and actionable streetscape intelligence.
Submitted 12 January, 2025; v1 submitted 29 November, 2024; originally announced November 2024.
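
Synchronization to within a fixed tolerance, such as the 50 milliseconds quoted above, reduces at its simplest to nearest-timestamp matching between streams. A generic sketch of that baseline, not the actual SASS service API:

```python
# Sketch of timestamp-based synchronization across two sensor streams:
# pair each camera frame with the nearest lidar scan within a tolerance.
# A generic nearest-neighbor scheme, not SASS's actual service API.
from bisect import bisect_left

def synchronize(ts_a, ts_b, tolerance=0.050):
    """Pair each timestamp in ts_a with the nearest in sorted ts_b (seconds)."""
    pairs = []
    for t in ts_a:
        i = bisect_left(ts_b, t)
        candidates = [j for j in (i - 1, i) if 0 <= j < len(ts_b)]
        if not candidates:
            continue
        j = min(candidates, key=lambda j: abs(ts_b[j] - t))
        if abs(ts_b[j] - t) <= tolerance:
            pairs.append((t, ts_b[j]))
    return pairs

camera = [0.000, 0.033, 0.066, 0.100]
lidar = [0.010, 0.045, 0.120]
print(synchronize(camera, lidar))   # pairs within the 50 ms tolerance
```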
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19102">arXiv:2411.19102</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.19102">pdf</a>, <a href="https://arxiv.org/format/2411.19102">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 360Recon: An Accurate Reconstruction Method Based on Depth Fusion from 360 Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Z">Zhongmiao Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Songpengcheng Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+J">Junyuan Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+X">Xiang Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+R">Renbiao Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Pei%2C+L">Ling Pei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19102v1-abstract-short" style="display: inline;"> 360-degree images offer a significantly wider field of view compared to traditional pinhole cameras, enabling sparse sampling and dense 3D reconstruction in low-texture environments. This makes them crucial for applications in VR, AR, and related fields. However, the inherent distortion caused by the wide field of view affects feature extraction and matching, leading to geometric consistency issue&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19102v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19102v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19102v1-abstract-full" style="display: none;"> 360-degree images offer a significantly wider field of view compared to traditional pinhole cameras, enabling sparse sampling and dense 3D reconstruction in low-texture environments. This makes them crucial for applications in VR, AR, and related fields. However, the inherent distortion caused by the wide field of view affects feature extraction and matching, leading to geometric consistency issues in subsequent multi-view reconstruction. In this work, we propose 360Recon, an innovative MVS algorithm for ERP images. The proposed spherical feature extraction module effectively mitigates distortion effects, and by combining the constructed 3D cost volume with multi-scale enhanced features from ERP images, our approach achieves high-precision scene reconstruction while preserving local geometric consistency. Experimental results demonstrate that 360Recon achieves state-of-the-art performance and high efficiency in depth estimation and 3D reconstruction on existing public panoramic reconstruction datasets. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19102v1-abstract-full').style.display = 'none'; document.getElementById('2411.19102v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18631">arXiv:2411.18631</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18631">pdf</a>, <a href="https://arxiv.org/format/2411.18631">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Counterfactual Learning-Driven Representation Disentanglement for Search-Enhanced Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cui%2C+J">Jiajun Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+S">Shuai Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Ju%2C+C">Chen Ju</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+J">Jinsong Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qingwen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18631v1-abstract-short" style="display: inline;"> For recommender systems in internet platforms, search activities provide additional insights into user interest through query-click interactions with items, and are thus widely used for enhancing personalized recommendation. However, these interacted items not only have transferable features matching users&#39; interest helpful for the recommendation domain, but also have features related to users&#39; un&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18631v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18631v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18631v1-abstract-full" style="display: none;"> For recommender systems in internet platforms, search activities provide additional insights into user interest through query-click interactions with items, and are thus widely used for enhancing personalized recommendation. However, these interacted items not only have transferable features matching users&#39; interest helpful for the recommendation domain, but also have features related to users&#39; unique intents in the search domain. Such domain gap of item features is neglected by most current search-enhanced recommendation methods. They directly incorporate these search behaviors into recommendation, and thus introduce partial negative transfer. 
To address this, we propose ClardRec, a Counterfactual learning-driven representation disentanglement framework for search-enhanced recommendation, based on the common belief that a user clicks an item under a query not solely because of the item-query match, but also because of the item's query-independent general features (e.g., color or style) that interest the user. These general features exclude the search-specific intents contained in queries, ensuring a pure match to users' underlying interest to complement recommendation. Following counterfactual thinking (how would user preference and query match change for an item if its query-related features were removed in search?), we leverage search queries to construct counterfactual signals that disentangle item representations, isolating only the query-independent general features. These representations subsequently enable feature augmentation and data augmentation for the recommendation scenario. Comprehensive experiments on real datasets demonstrate that ClardRec is effective in both collaborative filtering and sequential recommendation scenarios.
Submitted 14 November, 2024; originally announced November 2024.
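
The disentanglement idea can be caricatured in a few lines: split the item embedding into a query-related part and a query-independent part, reward the full embedding for explaining the observed click, and penalize the query-independent part for carrying any query signal. The module and losses below are illustrative assumptions, not ClardRec itself:

```python
# Illustrative sketch of disentangling item embeddings into query-related
# and query-independent parts; the encoders and losses are assumptions
# for illustration, not ClardRec's actual objective.
import torch
import torch.nn as nn
import torch.nn.functional as F

class Disentangler(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.query_related = nn.Linear(dim, dim)
        self.query_free = nn.Linear(dim, dim)

    def forward(self, item_emb, query_emb):
        z_q = self.query_related(item_emb)   # captures the item-query match
        z_g = self.query_free(item_emb)      # general features (color, style, ...)
        # Factual term: both parts together should explain the click under the query.
        factual = F.cosine_similarity(z_q + z_g, query_emb).mean()
        # Counterfactual term: with query-related features removed, the score
        # should no longer depend on the query.
        counterfactual = F.cosine_similarity(z_g, query_emb).abs().mean()
        return z_g, counterfactual - factual  # loss to minimize

model = Disentangler(64)
z_g, loss = model(torch.randn(32, 64), torch.randn(32, 64))
```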
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zhen Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+H">Haoyang Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xuefeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yixiu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yuxiang Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Chern%2C+E">Ethan Chern</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shijie Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Y">Yiwei Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+W">Weizhe Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Pengfei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16489v1-abstract-short" style="display: inline;"> This paper presents a critical examination of current approaches to replicating OpenAI&#39;s O1 model capabilities, with particular focus on the widespread but often undisclosed use of knowledge distillation techniques. While our previous work explored the fundamental technical path to O1 replication, this study reveals how simple distillation from O1&#39;s API, combined with supervised fine-tuning, can a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16489v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16489v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16489v1-abstract-full" style="display: none;"> This paper presents a critical examination of current approaches to replicating OpenAI&#39;s O1 model capabilities, with particular focus on the widespread but often undisclosed use of knowledge distillation techniques. While our previous work explored the fundamental technical path to O1 replication, this study reveals how simple distillation from O1&#39;s API, combined with supervised fine-tuning, can achieve superior performance on complex mathematical reasoning tasks. Through extensive experiments, we show that a base model fine-tuned on simply tens of thousands of samples O1-distilled long-thought chains outperforms O1-preview on the American Invitational Mathematics Examination (AIME) with minimal technical complexity. Moreover, our investigation extends beyond mathematical reasoning to explore the generalization capabilities of O1-distilled models across diverse tasks: hallucination, safety and open-domain QA. Notably, despite training only on mathematical problem-solving data, our models demonstrated strong generalization to open-ended QA tasks and became significantly less susceptible to sycophancy after fine-tuning. We deliberately make this finding public to promote transparency in AI research and to challenge the current trend of obscured technical claims in the field. 
Our work includes: (1) a detailed technical exposition of the distillation process and its effectiveness; (2) a comprehensive benchmark framework for evaluating and categorizing O1 replication attempts based on their technical transparency and reproducibility; and (3) a critical discussion of the limitations and potential risks of over-relying on distillation approaches. Our analysis culminates in a crucial bitter lesson: while the pursuit of more capable AI systems is important, the development of researchers grounded in first-principles thinking is paramount.
Submitted 25 November, 2024; originally announced November 2024.
Comments: 16 pages
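
The distillation recipe the paper critiques is, mechanically, just supervised fine-tuning on teacher-generated long-thought chains. A minimal sketch of that loop; the base model, data record, and hyperparameters are placeholders rather than the paper's setup:

```python
# Minimal shape of the distillation recipe under scrutiny: collect
# teacher long-thought chains, then do plain supervised fine-tuning.
# Model name, record contents, and hyperparameters are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")   # placeholder base model
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Each record pairs a problem with a teacher-distilled long chain of thought.
samples = [
    {"prompt": "Problem: ...", "long_thought": "Let us reason step by step ..."},
]

model.train()
for sample in samples:
    text = sample["prompt"] + "\n" + sample["long_thought"] + tok.eos_token
    batch = tok(text, return_tensors="pt", truncation=True, max_length=4096)
    out = model(**batch, labels=batch["input_ids"])   # standard causal LM loss
    out.loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```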

arXiv:2411.15702 [cs.IT, cs.CV, cs.NI]
Editable-DeepSC: Reliable Cross-Modal Semantic Communications for Facial Editing
Authors: Bin Chen, Wenbo Yu, Qinshan Zhang, Shu-Tao Xia
Abstract: Real-time computer vision (CV) plays a crucial role in various real-world applications, whose performance is highly dependent on communication networks. Nonetheless, the data-oriented characteristics of conventional communications often do not align with the special needs of real-time CV tasks. To alleviate this issue, the recently emerged semantic communications transmit only task-related semantic information and exhibit a promising landscape for addressing this problem. However, the communication challenges associated with semantic facial editing, one of the most important real-time CV applications on social media, remain largely unexplored. In this paper, we fill this gap by proposing Editable-DeepSC, a novel cross-modal semantic communication approach for facial editing. First, we theoretically discuss different transmission schemes that handle communications and editings separately, and emphasize the necessity of Joint Editing-Channel Coding (JECC) via iterative attribute matching, which integrates editings into the communication chain to preserve more semantic mutual information. To compactly represent the high-dimensional data, we leverage inversion methods via pre-trained StyleGAN priors for semantic coding. To tackle the dynamic channel noise conditions, we propose SNR-aware channel coding via model fine-tuning. Extensive experiments indicate that Editable-DeepSC can achieve superior editings while significantly saving the transmission bandwidth, even under high-resolution and out-of-distribution (OOD) settings.
Submitted 23 November, 2024; originally announced November 2024.
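
The dynamic channel conditions that SNR-aware channel coding must cope with are usually simulated as additive white Gaussian noise at a target SNR. A generic sketch of that noise model, not Editable-DeepSC's pipeline:

```python
# Generic AWGN channel simulation at a target SNR, the kind of noise
# model SNR-aware channel coding has to cope with; not Editable-DeepSC's
# actual pipeline.
import torch

def awgn_channel(z, snr_db):
    """Add white Gaussian noise to latent z at the given SNR (in dB)."""
    signal_power = z.pow(2).mean()
    noise_power = signal_power / (10 ** (snr_db / 10))
    return z + torch.randn_like(z) * noise_power.sqrt()

z = torch.randn(1, 512)           # transmitted semantic latent
z_rx = awgn_channel(z, snr_db=5)  # received latent under a noisy channel
```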

arXiv:2411.15281 [cs.LG, cs.AI]
ElastiFormer: Learned Redundancy Reduction in Transformer via Self-Distillation
Authors: Junzhang Liu, Tingkai Liu, Yueyuan Sui, Stephen Xia
Abstract: We introduce ElastiFormer, a post-training technique that adapts pretrained Transformer models into elastic counterparts with variable inference-time compute. ElastiFormer introduces small routing modules (as little as 0.00006% additional trainable parameters) to dynamically select subsets of network parameters and input tokens to be processed by each layer of the pretrained network in an input-dependent manner. The routing modules are trained using self-distillation losses to minimize the differences between the output of the pretrained model and that of the elastic counterpart. As ElastiFormer makes no assumption regarding the modality of the pretrained Transformer model, it can be readily applied to all modalities, covering causal language modeling, image modeling, and visual-language modeling tasks. We show that compute savings of 20% to 50% can be achieved for different components of the transformer layer, which can be reduced further by adding very low-rank LoRA weights (rank 1) trained via the same distillation objective. Finally, by comparing routing trained on different subsets of ImageNet, we show that ElastiFormer is robust against the training domain.
Submitted 22 November, 2024; originally announced November 2024.
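
An input-dependent routing module of the kind described here can be as small as one linear layer that scores tokens, with only the top-scoring fraction sent through the expensive sublayer. An illustrative PyTorch sketch, not ElastiFormer's released code:

```python
# Sketch of an input-dependent token router: a tiny module scores tokens
# and only the top fraction is processed by the expensive sublayer, the
# rest passing through unchanged. Illustrative, not ElastiFormer's code.
import torch
import torch.nn as nn

class TokenRouter(nn.Module):
    def __init__(self, dim, keep_ratio=0.5):
        super().__init__()
        self.score = nn.Linear(dim, 1)   # the "small routing module"
        self.keep_ratio = keep_ratio

    def forward(self, x, sublayer):
        # x: (batch, tokens, dim)
        k = max(1, int(x.shape[1] * self.keep_ratio))
        scores = self.score(x).squeeze(-1)       # (batch, tokens)
        idx = scores.topk(k, dim=1).indices      # tokens to process
        gather_idx = idx.unsqueeze(-1).expand(-1, -1, x.shape[-1])
        out = x.clone()                          # unselected tokens pass through
        picked = torch.gather(x, 1, gather_idx)
        out.scatter_(1, gather_idx, sublayer(picked))
        return out

router = TokenRouter(dim=64)
mlp = nn.Sequential(nn.Linear(64, 256), nn.GELU(), nn.Linear(256, 64))
y = router(torch.randn(2, 16, 64), mlp)   # only ~8 of 16 tokens hit the MLP
```

In the self-distillation setting, the router would then be trained so that this cheaper output stays close to the full pretrained model's output.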
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15269">arXiv:2411.15269</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15269">pdf</a>, <a href="https://arxiv.org/format/2411.15269">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MambaIRv2: Attentive State Space Restoration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">Hang Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yong Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zha%2C+Y">Yaohua Zha</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yulun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wenbo Li</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+T">Tao Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shu-Tao Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yawei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15269v1-abstract-short" style="display: inline;"> The Mamba-based image restoration backbones have recently demonstrated significant potential in balancing global reception and computational efficiency. However, the inherent causal modeling limitation of Mamba, where each token depends solely on its predecessors in the scanned sequence, restricts the full utilization of pixels across the image and thus presents new challenges in image restoration&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15269v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15269v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15269v1-abstract-full" style="display: none;"> The Mamba-based image restoration backbones have recently demonstrated significant potential in balancing global reception and computational efficiency. However, the inherent causal modeling limitation of Mamba, where each token depends solely on its predecessors in the scanned sequence, restricts the full utilization of pixels across the image and thus presents new challenges in image restoration. In this work, we propose MambaIRv2, which equips Mamba with the non-causal modeling ability similar to ViTs to reach the attentive state space restoration model. Specifically, the proposed attentive state-space equation allows to attend beyond the scanned sequence and facilitate image unfolding with just one single scan. Moreover, we further introduce a semantic-guided neighboring mechanism to encourage interaction between distant but similar pixels. Extensive experiments show our MambaIRv2 outperforms SRFormer by \textbf{even 0.35dB} PSNR for lightweight SR even with \textbf{9.3\% less} parameters and suppresses HAT on classic SR by \textbf{up to 0.29dB}. Code is available at \url{https://github.com/csguoh/MambaIR}. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15269v1-abstract-full').style.display = 'none'; document.getElementById('2411.15269v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13057">arXiv:2411.13057</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13057">pdf</a>, <a href="https://arxiv.org/format/2411.13057">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Branches, Assemble! Multi-Branch Cooperation Network for Large-Scale Click-Through Rate Prediction at Taobao </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Z">Zida Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yuangang Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+S">Shuai Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaoming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+J">Jinsong Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qingwen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Tsang%2C+I+W">Ivor W. Tsang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13057v1-abstract-short" style="display: inline;"> Existing click-through rate (CTR) prediction works have studied the role of feature interaction through a variety of techniques. Each interaction technique exhibits its own strength, and solely using one type could constrain the model&#39;s capability to capture the complex feature relationships, especially for industrial large-scale data with enormous users and items. Recent research shows that effec&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13057v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13057v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13057v1-abstract-full" style="display: none;"> Existing click-through rate (CTR) prediction works have studied the role of feature interaction through a variety of techniques. Each interaction technique exhibits its own strength, and solely using one type could constrain the model&#39;s capability to capture the complex feature relationships, especially for industrial large-scale data with enormous users and items. Recent research shows that effective CTR models often combine an MLP network with a dedicated feature interaction network in a two-parallel structure. 
However, the interplay and cooperative dynamics between different streams or branches remain under-researched. In this work, we introduce a novel Multi-Branch Cooperation Network (MBCnet), which enables multiple branch networks to collaborate with each other for better complex feature interaction modeling. Specifically, MBCnet consists of three branches: the Expert-based Feature Grouping and Crossing (EFGC) branch, which promotes the model's memorization of specific feature fields, and the low-rank Cross Net branch and Deep branch, which enhance explicit and implicit feature crossing, respectively, for improved generalization. Among the branches, a novel cooperation scheme is proposed based on two principles: branch co-teaching and moderate differentiation. Branch co-teaching encourages well-learned branches to support poorly-learned ones on specific training samples. Moderate differentiation advocates that branches maintain a reasonable level of difference in their feature representations. The cooperation strategy improves learning through mutual knowledge sharing via co-teaching and boosts the discovery of diverse feature interactions across branches. Extensive experiments on large-scale industrial datasets and an online A/B test demonstrate MBCnet's superior performance, delivering a 0.09 point increase in CTR, 1.49% growth in deals, and a 1.62% rise in GMV. Core codes will be released soon.
Submitted 20 November, 2024; originally announced November 2024.
Comments: 10 pages
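
The three-branch layout can be sketched as parallel networks over a shared input whose per-branch logits are combined; co-teaching and differentiation would act on those per-branch logits during training. A simplified PyTorch skeleton in which the EFGC branch is reduced to a plain MLP stand-in:

```python
# Skeleton of a multi-branch CTR model: parallel branches whose logits
# are combined. Branch names follow the abstract; internals are
# simplified stand-ins (e.g., a plain MLP where EFGC would be).
import torch
import torch.nn as nn

class CrossNet(nn.Module):
    """Low-rank explicit feature crossing: x <- x0 * V(U(x)) + x."""
    def __init__(self, dim, rank=8, layers=2):
        super().__init__()
        self.U = nn.ModuleList(nn.Linear(dim, rank, bias=False) for _ in range(layers))
        self.V = nn.ModuleList(nn.Linear(rank, dim) for _ in range(layers))

    def forward(self, x0):
        x = x0
        for U, V in zip(self.U, self.V):
            x = x0 * V(U(x)) + x
        return x

class MBC(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.efgc = nn.Sequential(nn.Linear(dim, dim), nn.ReLU())   # stand-in for EFGC
        self.cross = CrossNet(dim)                                  # explicit crossing
        self.deep = nn.Sequential(nn.Linear(dim, 128), nn.ReLU(), nn.Linear(128, dim))
        self.heads = nn.ModuleList(nn.Linear(dim, 1) for _ in range(3))

    def forward(self, x):
        branch_logits = [h(b(x)) for h, b in
                         zip(self.heads, (self.efgc, self.cross, self.deep))]
        # Co-teaching / moderate-differentiation losses would compare these
        # per-branch logits during training; here we simply average them.
        return torch.sigmoid(torch.stack(branch_logits).mean(0))

model = MBC(dim=32)
ctr = model(torch.randn(4, 32))   # predicted click-through probabilities
```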
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09259">arXiv:2411.09259</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09259">pdf</a>, <a href="https://arxiv.org/format/2411.09259">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Jailbreak Attacks and Defenses against Multimodal Generative Models: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xuannan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+X">Xing Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Peipei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zekun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Huaibo Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+S">Shuhan Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Miaoxuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+Y">Yueying Zou</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+R">Ran He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09259v2-abstract-short" style="display: inline;"> The rapid evolution of multimodal foundation models has led to significant advancements in cross-modal understanding and generation across diverse modalities, including text, images, audio, and video. However, these models remain susceptible to jailbreak attacks, which can bypass built-in safety mechanisms and induce the production of potentially harmful content. Consequently, understanding the me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09259v2-abstract-full').style.display = 'inline'; document.getElementById('2411.09259v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09259v2-abstract-full" style="display: none;"> The rapid evolution of multimodal foundation models has led to significant advancements in cross-modal understanding and generation across diverse modalities, including text, images, audio, and video. However, these models remain susceptible to jailbreak attacks, which can bypass built-in safety mechanisms and induce the production of potentially harmful content. Consequently, understanding the methods of jailbreak attacks and existing defense mechanisms is essential to ensure the safe deployment of multimodal generative models in real-world scenarios, particularly in security-sensitive applications. To provide comprehensive insight into this topic, this survey reviews jailbreak and defense in multimodal generative models. First, given the generalized lifecycle of multimodal jailbreak, we systematically explore attacks and corresponding defense strategies across four levels: input, encoder, generator, and output. 
Based on this analysis, we present a detailed taxonomy of attack methods, defense mechanisms, and evaluation frameworks specific to multimodal generative models. Additionally, we cover a wide range of input-output configurations, including modalities such as Any-to-Text, Any-to-Vision, and Any-to-Any within generative systems. Finally, we highlight current research challenges and propose potential directions for future research. The open-source repository corresponding to this work can be found at https://github.com/liuxuannan/Awesome-Multimodal-Jailbreak.
Submitted 9 December, 2024; v1 submitted 14 November, 2024; originally announced November 2024.
Comments: ongoing work
