1 /* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /*
3  *     Copyright 2010 Couchbase, Inc
4  *
5  *   Licensed under the Apache License, Version 2.0 (the "License");
6  *   you may not use this file except in compliance with the License.
7  *   You may obtain a copy of the License at
8  *
9  *       http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *   Unless required by applicable law or agreed to in writing, software
12  *   distributed under the License is distributed on an "AS IS" BASIS,
13  *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *   See the License for the specific language governing permissions and
15  *   limitations under the License.
16  */
17 
18 #include <sys/types.h>
19 #include <sys/stat.h>
20 #include <unistd.h>
21 #include <stdint.h>
22 #include <fcntl.h>
23 #include <stdio.h>
24 #include <errno.h>
25 #include <string.h>
26 
27 #include "filemgr.h"
28 #include "filemgr_ops.h"
29 
30 #if !defined(WIN32) && !defined(_WIN32)
31 
_filemgr_linux_open(const char *pathname, int flags, mode_t mode)32 int _filemgr_linux_open(const char *pathname, int flags, mode_t mode)
33 {
34     int fd;
35     do {
36         fd = open(pathname, flags | O_LARGEFILE, mode);
37     } while (fd == -1 && errno == EINTR);
38 
39     if (fd < 0) {
40         return (int) convert_errno_to_fdb_status(errno, // LCOV_EXCL_LINE
41                                                  FDB_RESULT_OPEN_FAIL);
42     }
43     return fd;
44 }
45 
_filemgr_linux_pwrite(int fd, void *buf, size_t count, cs_off_t offset)46 ssize_t _filemgr_linux_pwrite(int fd, void *buf, size_t count, cs_off_t offset)
47 {
48     ssize_t rv;
49     do {
50         rv = pwrite(fd, buf, count, offset);
51     } while (rv == -1 && errno == EINTR); // LCOV_EXCL_LINE
52 
53     if (rv < 0) {
54         return (ssize_t) convert_errno_to_fdb_status(errno, // LCOV_EXCL_LINE
55                                                      FDB_RESULT_WRITE_FAIL);
56     }
57     return rv;
58 }
59 
_filemgr_linux_pread(int fd, void *buf, size_t count, cs_off_t offset)60 ssize_t _filemgr_linux_pread(int fd, void *buf, size_t count, cs_off_t offset)
61 {
62     ssize_t rv;
63     do {
64         rv = pread(fd, buf, count, offset);
65     } while (rv == -1 && errno == EINTR); // LCOV_EXCL_LINE
66 
67     if (rv < 0) {
68         return (ssize_t) convert_errno_to_fdb_status(errno, // LCOV_EXCL_LINE
69                                                      FDB_RESULT_READ_FAIL);
70     }
71     return rv;
72 }
73 
_filemgr_linux_close(int fd)74 int _filemgr_linux_close(int fd)
75 {
76     int rv = 0;
77     if (fd != -1) {
78         do {
79             rv = close(fd);
80         } while (rv == -1 && errno == EINTR); // LCOV_EXCL_LINE
81     }
82 
83     if (rv < 0) {
84         return (int) convert_errno_to_fdb_status(errno, // LCOV_EXCL_LINE
85                                                  FDB_RESULT_CLOSE_FAIL);
86     }
87 
88     return FDB_RESULT_SUCCESS;
89 }
90 
_filemgr_linux_goto_eof(int fd)91 cs_off_t _filemgr_linux_goto_eof(int fd)
92 {
93     cs_off_t rv = lseek(fd, 0, SEEK_END);
94     if (rv < 0) {
95         return (cs_off_t) convert_errno_to_fdb_status(errno, // LCOV_EXCL_LINE
96                                                       FDB_RESULT_SEEK_FAIL);
97     }
98     return rv;
99 }
100 
101 // LCOV_EXCL_START
_filemgr_linux_file_size(const char *filename)102 cs_off_t _filemgr_linux_file_size(const char *filename)
103 {
104     struct stat st;
105     if (stat(filename, &st) == -1) {
106         return (cs_off_t) convert_errno_to_fdb_status(errno,
107                                                       FDB_RESULT_READ_FAIL);
108     }
109     return st.st_size;
110 }
111 // LCOV_EXCL_STOP
112 
_filemgr_linux_fsync(int fd)113 int _filemgr_linux_fsync(int fd)
114 {
115     int rv;
116     do {
117         rv = fsync(fd);
118     } while (rv == -1 && errno == EINTR); // LCOV_EXCL_LINE
119 
120     if (rv == -1) {
121         return (int) convert_errno_to_fdb_status(errno, // LCOV_EXCL_LINE
122                                                  FDB_RESULT_FSYNC_FAIL);
123     }
124 
125     return FDB_RESULT_SUCCESS;
126 }
127 
128 // LCOV_EXCL_START
_filemgr_linux_fdatasync(int fd)129 int _filemgr_linux_fdatasync(int fd)
130 {
131 #if defined(__linux__) && !defined(__ANDROID__)
132     int rv;
133     do {
134         rv = fdatasync(fd);
135     } while (rv == -1 && errno == EINTR);
136 
137     if (rv == -1) {
138         return (int) convert_errno_to_fdb_status(errno, // LCOV_EXCL_LINE
139                                                  FDB_RESULT_FSYNC_FAIL);
140     }
141 
142     return FDB_RESULT_SUCCESS;
143 #else // __linux__ && not __ANDROID__
144     return _filemgr_linux_fsync(fd);
145 #endif // __linux__ && not __ANDROID__
146 }
147 // LCOV_EXCL_STOP
148 
_filemgr_linux_get_errno_str(char *buf, size_t size)149 void _filemgr_linux_get_errno_str(char *buf, size_t size) {
150     if (!buf) {
151         return;
152     } else {
153         char *tbuf = alca(char, size);
154 #ifdef _POSIX_SOURCE
155         char *ret = strerror_r(errno, tbuf, size);
156         snprintf(buf, size, "errno = %d: '%s'", errno, ret);
157 #else
158         (void)strerror_r(errno, tbuf, size);
159         snprintf(buf, size, "errno = %d: '%s'", errno, tbuf);
160 #endif
161     }
162 }
163 
_filemgr_aio_init(struct async_io_handle *aio_handle)164 int _filemgr_aio_init(struct async_io_handle *aio_handle)
165 {
166 #ifdef _ASYNC_IO
167     if (!aio_handle) {
168         return FDB_RESULT_INVALID_ARGS;
169     }
170     if (!aio_handle->queue_depth || aio_handle->queue_depth > 512) {
171         aio_handle->queue_depth =  ASYNC_IO_QUEUE_DEPTH;
172     }
173     if (!aio_handle->block_size) {
174         aio_handle->block_size = FDB_BLOCKSIZE;
175     }
176 
177     void *buf;
178     malloc_align(buf, FDB_SECTOR_SIZE,
179                  aio_handle->block_size * aio_handle->queue_depth);
180     aio_handle->aio_buf = (uint8_t *) buf;
181     aio_handle->offset_array = (uint64_t*)
182         malloc(sizeof(uint64_t) * aio_handle->queue_depth);
183 
184     aio_handle->ioq = (struct iocb**)
185         malloc(sizeof(struct iocb*) * aio_handle->queue_depth);
186     aio_handle->events = (struct io_event *)
187         calloc(aio_handle->queue_depth, sizeof(struct io_event));
188 
189     for (size_t k = 0; k < aio_handle->queue_depth; ++k) {
190         aio_handle->ioq[k] = (struct iocb*) malloc(sizeof(struct iocb));
191     }
192     memset(&aio_handle->ioctx, 0, sizeof(io_context_t));
193 
194     int rc = io_queue_init(aio_handle->queue_depth, &aio_handle->ioctx);
195     if (rc < 0) {
196         return FDB_RESULT_AIO_INIT_FAIL;
197     }
198     return FDB_RESULT_SUCCESS;
199 #else
200     return FDB_RESULT_AIO_NOT_SUPPORTED;
201 #endif
202 }
203 
_filemgr_aio_prep_read(struct async_io_handle *aio_handle, size_t aio_idx, size_t read_size, uint64_t offset)204 int _filemgr_aio_prep_read(struct async_io_handle *aio_handle, size_t aio_idx,
205                            size_t read_size, uint64_t offset)
206 {
207 #ifdef _ASYNC_IO
208     if (!aio_handle) {
209         return FDB_RESULT_INVALID_ARGS;
210     }
211     io_prep_pread(aio_handle->ioq[aio_idx], aio_handle->fd,
212                   aio_handle->aio_buf + (aio_idx * aio_handle->block_size),
213                   aio_handle->block_size,
214                   (offset / aio_handle->block_size) * aio_handle->block_size);
215     // Record the original offset.
216     aio_handle->offset_array[aio_idx] = offset;
217     aio_handle->ioq[aio_idx]->data = &aio_handle->offset_array[aio_idx];
218     return FDB_RESULT_SUCCESS;
219 #else
220     return FDB_RESULT_AIO_NOT_SUPPORTED;
221 #endif
222 }
223 
_filemgr_aio_submit(struct async_io_handle *aio_handle, int num_subs)224 int _filemgr_aio_submit(struct async_io_handle *aio_handle, int num_subs)
225 {
226 #ifdef _ASYNC_IO
227     if (!aio_handle) {
228         return FDB_RESULT_INVALID_ARGS;
229     }
230     int rc = io_submit(aio_handle->ioctx, num_subs, aio_handle->ioq);
231     if (rc < 0) {
232         return FDB_RESULT_AIO_SUBMIT_FAIL;
233     }
234     return rc; // 'rc' should be equal to 'num_subs' upon succcess.
235 #else
236     return FDB_RESULT_AIO_NOT_SUPPORTED;
237 #endif
238 }
239 
_filemgr_aio_getevents(struct async_io_handle *aio_handle, int min, int max, unsigned int timeout)240 int _filemgr_aio_getevents(struct async_io_handle *aio_handle, int min,
241                            int max, unsigned int timeout)
242 {
243 #ifdef _ASYNC_IO
244     if (!aio_handle) {
245         return FDB_RESULT_INVALID_ARGS;
246     }
247 
248     // Passing max timeout (ms) means that it waits until at least 'min' events
249     // have been seen.
250     bool wait_for_min = true;
251     struct timespec ts;
252     if (timeout < (unsigned int) -1) {
253         ts.tv_sec = timeout / 1000;
254         timeout %= 1000;
255         ts.tv_nsec = timeout * 1000000;
256         wait_for_min = false;
257     }
258 
259     int num_events = io_getevents(aio_handle->ioctx, min, max, aio_handle->events,
260                                   wait_for_min ? NULL : &ts);
261     if (num_events < 0) {
262         return FDB_RESULT_AIO_GETEVENTS_FAIL;
263     }
264     return num_events;
265 #else
266     return FDB_RESULT_AIO_NOT_SUPPORTED;
267 #endif
268 }
269 
_filemgr_aio_destroy(struct async_io_handle *aio_handle)270 int _filemgr_aio_destroy(struct async_io_handle *aio_handle)
271 {
272 #ifdef _ASYNC_IO
273     if (!aio_handle) {
274         return FDB_RESULT_INVALID_ARGS;
275     }
276 
277     io_queue_release(aio_handle->ioctx);
278     for(size_t k = 0; k < aio_handle->queue_depth; ++k)
279     {
280         free(aio_handle->ioq[k]);
281     }
282     free(aio_handle->ioq);
283     free(aio_handle->events);
284     free_align(aio_handle->aio_buf);
285     free(aio_handle->offset_array);
286     return FDB_RESULT_SUCCESS;
287 #else
288     return FDB_RESULT_AIO_NOT_SUPPORTED;
289 #endif
290 }
291 
292 #if defined(__APPLE__) || defined(__FreeBSD__)
293 #include <sys/mount.h>
294 #elif !defined(__sun)
295 #include <sys/vfs.h>
296 #endif
297 
298 #ifndef BTRFS_SUPER_MAGIC
299 #define BTRFS_SUPER_MAGIC 0x9123683E
300 #endif
301 
302 #ifdef HAVE_BTRFS_IOCTL_H
303 #include <btrfs/ioctl.h>
304 #else
305 #include <sys/ioctl.h>
306 #ifndef BTRFS_IOCTL_MAGIC
307 #define BTRFS_IOCTL_MAGIC 0x94
308 #endif //BTRFS_IOCTL_MAGIC
309 
// Argument block for the BTRFS_IOC_CLONE_RANGE ioctl. Declared locally
// because this branch is compiled only when HAVE_BTRFS_IOCTL_H is not
// defined, i.e. <btrfs/ioctl.h> is not included.
struct btrfs_ioctl_clone_range_args {
    int64_t src_fd;        // descriptor of the file to clone from
    uint64_t src_offset;   // start offset of the range in the source file
    uint64_t src_length;   // length of the range to clone
    uint64_t dest_offset;  // start offset in the file the ioctl is issued on
};
316 
// Fallback definitions of the ioctl request-number encoding macros, used to
// build BTRFS_IOC_CLONE_RANGE when <btrfs/ioctl.h> is not available.
// A request number packs four fields: nr, type, size and dir. The *BITS
// macros give each field's width and the *SHIFT macros its bit position.
// NOTE(review): several of these are defined without an #ifndef guard;
// presumably they match any definitions pulled in via <sys/ioctl.h> --
// confirm on each supported platform.
#define _IOC_NRBITS     8
#define _IOC_TYPEBITS   8

#ifndef _IOC_SIZEBITS
# define _IOC_SIZEBITS  14
#endif

#ifndef _IOC_DIRBITS
# define _IOC_DIRBITS   2
#endif

// Each field starts where the previous one ends.
#define _IOC_NRSHIFT    0
#define _IOC_TYPESHIFT  (_IOC_NRSHIFT+_IOC_NRBITS)
#define _IOC_SIZESHIFT  (_IOC_TYPESHIFT+_IOC_TYPEBITS)
#define _IOC_DIRSHIFT   (_IOC_SIZESHIFT+_IOC_SIZEBITS)

#ifndef _IOC_WRITE
# define _IOC_WRITE     1U
#endif

// Compose a request number from its four fields.
#ifndef _IOC
#define _IOC(dir,type,nr,size) \
        (((dir)  << _IOC_DIRSHIFT) | \
        ((type) << _IOC_TYPESHIFT) | \
        ((nr)   << _IOC_NRSHIFT) | \
        ((size) << _IOC_SIZESHIFT))
#endif // _IOC

// _IOW takes a type name as its 'size' argument and encodes sizeof() of it.
#define _IOC_TYPECHECK(t) (sizeof(t))
#ifndef _IOW
#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),\
                          (_IOC_TYPECHECK(size)))
#endif //_IOW

// Request number 13 in the BTRFS_IOCTL_MAGIC namespace, carrying a
// struct btrfs_ioctl_clone_range_args.
#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
                              struct btrfs_ioctl_clone_range_args)
353 #endif // HAVE_BTRFS_IOCTL_H
354 
355 #ifndef EXT4_SUPER_MAGIC
356 #define EXT4_SUPER_MAGIC 0xEF53
357 #endif
358 
// Local fallback for the ext4 block-ownership-transfer ioctl, used when the
// system headers do not provide EXT4_IOC_TRANFER_BLK_OWNERSHIP.
// The identifier spelling ("TRANFER") is part of the interface and is kept.
#ifndef EXT4_IOC_TRANFER_BLK_OWNERSHIP
/* linux/fs/ext4/ext4.h */
#define EXT4_IOC_TRANFER_BLK_OWNERSHIP  _IOWR('f', 22, struct tranfer_blk_ownership)

/* Argument block passed to EXT4_IOC_TRANFER_BLK_OWNERSHIP. */
struct tranfer_blk_ownership {
    int32_t dest_fd;           /* destination file descriptor */
    uint64_t src_start;        /* logical start offset in block for src */
    uint64_t dest_start;       /* logical start offset in block for dest */
    uint64_t len;              /* block length to be ownership-transferred */
};
#endif // EXT4_IOC_TRANFER_BLK_OWNERSHIP
370 
371 #ifndef __sun
372 static
_filemgr_linux_ext4_share_blks(int src_fd, int dst_fd, uint64_t src_off, uint64_t dst_off, uint64_t len)373 int _filemgr_linux_ext4_share_blks(int src_fd, int dst_fd, uint64_t src_off,
374                                    uint64_t dst_off, uint64_t len)
375 {
376     int err;
377     struct tranfer_blk_ownership tbo;
378     tbo.dest_fd = dst_fd;
379     tbo.src_start = src_off;
380     tbo.dest_start = dst_off;
381     tbo.len = len;
382     err = ioctl(src_fd, EXT4_IOC_TRANFER_BLK_OWNERSHIP, &tbo);
383     if (err) {
384         return errno;
385     }
386     return err;
387 }
388 #endif
389 
_filemgr_linux_get_fs_type(int src_fd)390 int _filemgr_linux_get_fs_type(int src_fd)
391 {
392 #ifdef __sun
393     // No support for ZFS
394     return FILEMGR_FS_NO_COW;
395 #else
396     int ret;
397     struct statfs sfs;
398     ret = fstatfs(src_fd, &sfs);
399     if (ret != 0) {
400         return FDB_RESULT_INVALID_ARGS;
401     }
402     switch (sfs.f_type) {
403         case EXT4_SUPER_MAGIC:
404             ret = _filemgr_linux_ext4_share_blks(src_fd, src_fd, 0, 0, 0);
405             if (ret == 0) {
406                 ret = FILEMGR_FS_EXT4_WITH_COW;
407             } else {
408                 ret = FILEMGR_FS_NO_COW;
409             }
410             break;
411         case BTRFS_SUPER_MAGIC:
412             ret = FILEMGR_FS_BTRFS;
413             break;
414         default:
415             ret = FILEMGR_FS_NO_COW;
416     }
417     return ret;
418 #endif
419 }
420 
_filemgr_linux_copy_file_range(int fs_type, int src_fd, int dst_fd, uint64_t src_off, uint64_t dst_off, uint64_t len)421 int _filemgr_linux_copy_file_range(int fs_type,
422                                    int src_fd, int dst_fd, uint64_t src_off,
423                                    uint64_t dst_off, uint64_t len)
424 {
425     int ret = (int)FDB_RESULT_INVALID_ARGS;
426 #ifndef __sun
427     if (fs_type == FILEMGR_FS_BTRFS) {
428         struct btrfs_ioctl_clone_range_args cr_args;
429 
430         memset(&cr_args, 0, sizeof(cr_args));
431         cr_args.src_fd = src_fd;
432         cr_args.src_offset = src_off;
433         cr_args.src_length = len;
434         cr_args.dest_offset = dst_off;
435         ret = ioctl(dst_fd, BTRFS_IOC_CLONE_RANGE, &cr_args);
436         if (ret != 0) { // LCOV_EXCL_START
437             ret = errno;
438         }              // LCOV_EXCL_STOP
439     } else if (fs_type == FILEMGR_FS_EXT4_WITH_COW) {
440         ret = _filemgr_linux_ext4_share_blks(src_fd, dst_fd, src_off,
441                                              dst_off, len);
442     }
443 #endif
444     return ret;
445 }
446 
// Table of the POSIX-backed file operations defined in this file; a pointer
// to it is handed out by get_linux_filemgr_ops().
// The initializers are positional, so their order must match the member
// order of struct filemgr_ops (declared outside this file -- presumably in
// filemgr_ops.h; verify before reordering or inserting entries).
struct filemgr_ops linux_ops = {
    // Basic file operations
    _filemgr_linux_open,
    _filemgr_linux_pwrite,
    _filemgr_linux_pread,
    _filemgr_linux_close,
    _filemgr_linux_goto_eof,
    _filemgr_linux_file_size,
    // NOTE(review): fdatasync is listed before fsync here; confirm this
    // matches the struct's member order.
    _filemgr_linux_fdatasync,
    _filemgr_linux_fsync,
    _filemgr_linux_get_errno_str,
    // Async I/O operations
    _filemgr_aio_init,
    _filemgr_aio_prep_read,
    _filemgr_aio_submit,
    _filemgr_aio_getevents,
    _filemgr_aio_destroy,
    // File-system detection and range sharing
    _filemgr_linux_get_fs_type,
    _filemgr_linux_copy_file_range
};
466 
get_linux_filemgr_ops()467 struct filemgr_ops * get_linux_filemgr_ops()
468 {
469     return &linux_ops;
470 }
471 
472 #endif
473