@PhdThesis{Almansa:2024:DD3,
author = {Eva Almansa},
title = {Data-driven depth and 3D architectural layout estimation of an interior environment from monocular panoramic input},
school = {PhD Programme in Mathematics and Computer Science, University of Cagliari, Italy},
year = {2024},
abstract = { In recent years, there has been significant research interest in the automatic 3D reconstruction and modeling of indoor scenes from captured data, giving rise to an emerging sub-field within 3D reconstruction. The primary goal is to convert an input source, which represents a sample of a real-world indoor environment, into a model that may encompass geometric, structural, and/or visual abstractions. Within the scope of this thesis, the focus has been on the extraction of geometric information from a single panoramic image, either by using only visual data or aided by very sparse registered depth information. This particular setup has attracted a lot of interest in recent years, since 360$^\circ$ images offer rapid and comprehensive single-image coverage and are supported by a wide range of professional and consumer capture devices, which makes the data acquisition process both efficient and cost-effective. On the other hand, despite the 360-degree coverage, inferring a comprehensive model from mostly visual input in the presence of noise, missing data, and clutter remains very challenging. Thus, my research has focused on finding clever ways to exploit prior information, in the form of architectural priors and data-driven priors derived from large sets of examples, to design end-to-end deep learning solutions to well-defined fundamental tasks in the structured reconstruction pipeline. The tasks on which I have focused are, in particular, depth estimation from a single 360-degree image, depth completion from a single 360-degree image enriched with sparse depth measurements, and 3D architectural layout estimation from a single 360-degree image. While the first two problems produce pixel-wise output in the form of a dense depth map, the latter consists in reconstructing, from the image of the furnished room, a simplified model of the 3D shape of its bounding permanent surfaces. As a first contribution towards reconstructing indoor information from purely visual data, I introduced a novel deep neural network to estimate a depth map from a single monocular indoor panorama. The network works directly on the equirectangular projection, exploiting the properties of indoor 360-degree images. Starting from the fact that gravity plays an important role in the design and construction of man-made indoor scenes, the network compactly encodes the scene into vertical spherical slices, and exploits long- and short-term relationships among slices to recover an equirectangular depth map directly from an equirectangular RGB image. My second contribution expands this approach to the common situation in which we receive as input a single equirectangular image registered with a sparse depth map, as provided by a variety of common capture setups. In this approach, depth is inferred by an efficient and lightweight single-branch network, which employs a dynamic gating system to process dense visual data and sparse geometric data together. Furthermore, a new augmentation strategy makes the model robust to different types of sparsity, including those generated by various structured-light sensors and LiDAR setups. While the two preceding contributions focus on the estimation of per-pixel geometric information, my third contribution has tackled the problem of recovering the 3D shape of the bounding permanent surfaces of a room from a single panoramic image. 
The method also exploits gravity-aligned features, but within a significantly different setup, dictated by the fact that we not only need to separate walls, ceiling, and floor, but also need to recover the plausible shape of invisible areas. The proposed approach, differently from prior state-of-the-art methods, fully addresses the problem in 3D, significantly expanding the reconstruction space. In particular, a graph convolutional network directly infers the room structure as a 3D mesh by progressively deforming a graph-encoded tessellated sphere mapped to the spherical panorama, leveraging perceptual features extracted from the input image. Gravity-aligned features are actively incorporated into the graph through a projection layer that exploits the recent concept of multi-head self-attention, and specialized losses guide the deformation towards plausible solutions even in the presence of massive clutter and occlusions. Benchmarks on publicly available data show that all three methods are on par with or better than the state of the art. },
url = {http://vic.crs4.it/vic/cgi-bin/bib-page.cgi?id='Almansa:2024:DD3'},
}