Project

General

Profile

Statistics
| Branch: | Revision:

gdp-if / tensorflow / gdpfs.h @ master

History | View | Annotate | Download (11.7 KB)

1
/* vim: set ai sw=4 sts=4 ts=4 :*/
2

    
3
/*
** A virtual GDP-based filesystem implementation.
**
** A file system is anchored by a "meta log", which is the root of
** the filesystem and carries the equivalent of all the "inodes" ---
** essentially the metadata for the file system.  The first record in
** that log has to be a GDPfsMeta entry.
**
** Directories exist only in the meta-log.  Each directory entry
** encodes the operation (GDPfs::ADD to add a new entry or GDPfs::DEL
** to delete an entry), the file-system name of the entry, the type
** of the entry (GDPfs::DIR for a nested directory or GDPfs::FILE
** for a leaf node), and if it is a file, the name of the log containing
** the contents of that file.
**
** Files are stored in separate logs, each of which has a randomized
** name.  Files are a sequence of GDPfsFchunk records which are
** concatenated to get the contents of the file.
**
** Disclaimer: I'm doing this description based on my understanding of
** how it works based on reading the code.  It may not be completely
** accurate, and doesn't match some of the other comments by the
** original authors.   - EA 7/2020.
*/
27

    
28
#include <iostream>
29
#include <cstdint>
30
#include <map>
31
#include <vector>
32
#include <string>
33
#include <assert.h>
34
#include <sys/time.h>
35

    
36
#ifndef GDP_FS_CAAPI_H_
37
#define GDP_FS_CAAPI_H_
38

    
39
#include "GDPfs.pb.h"
40

    
41
extern "C" {
42
        #include <gdp/gdp.h>
43
        #include <ep/ep_dbg.h>
44
        #include <ep/ep_app.h>
45
        #include <ep/ep_crypto.h>
46
}
47

    
48
// Size limits, in bytes, for a single log record and a single write.
#define MAX_RECSIZE 65536       // Max data in a single record
#define MAX_WRITESIZE 32768     // Max size of a write (excluding protobuf
                                //     overhead)
51

    
52
// Error codes and such.  Zero is success; every failure code is negative.
typedef enum
{
        kSuccess = 0,                   // generic success code
        kFailure = -1,                  // generic failure code

        kBadConfig = -2,                // bad configuration
        kInvalidArg = -3,               // invalid argument

        kGdpError = -4,                 // generic error by the GDP library
        kGdpNameError = -5,             // GDP library error during name parsing
        kGdpErrorCreate = -6,           // GDP library error during creation

        kProtoParsingError = -10,       // error during parsing message

} GdpfsStatus;
68

    
69
// Our version of a stat structure, very similar to
// tensorflow FileStatistics.
struct GdpStat
{
        int64_t length = -1;            // defaults to -1 until filled in
        int64_t mtime_nsec = 0;         // modification time, nanoseconds
        bool is_directory = false;      // true when the entry is a directory

        // Default construction keeps the in-class initializers above.
        GdpStat() {}

        // Construct with every field given explicitly.
        GdpStat(int64_t length, int64_t mtime_nsec, bool is_directory)
                        : length(length), mtime_nsec(mtime_nsec),
                                is_directory(is_directory) {}
        ~GdpStat() {}

};
84

    
85

    
86
/************************** FUNCTIONS *********************************/
87

    
88
// Get time since epoch in ns.
uint64_t TimeNS();
90

    
91
// Initializes the library. "mode" provides a general global I/O mode,
92
// invidual usage can be more restrictive. i.e. memory-mapped files are
93
// always in RO mode, even if the initialization mode allows writing.
94
// "debug_setting" overrrides any config file settings for debugging
95
// output.
96
GdpfsStatus GDPfsInit(gdp_iomode_t mode=GDP_MODE_RO,
97
                                                const char* debug_setting=NULL);
98

    
99
// Create a null-terminated string of length "len" (including the null)
// and store it at the provided memory location. If "cheat" is true, then
// a deterministic string is generated instead of a truly random string,
// usually resulting in much faster creation.
// Assumes that "s" can hold "len" bytes.
void RandData(char *s, const int len, bool cheat=false);
105

    
106
// Returns the full path (fname) split into individual components.
// Uses "://" as a delimiter for scheme, and "/" as a delimiter for
// path components. An example:
// "gdp://x/y/z" => "scheme" = "gdp", "parts" = {"x", "y", "z"}
void SplitPath(const std::string& fname,
                                        std::string* scheme,
                                        std::vector<std::string> *parts);
113

    
114
// Takes in a full path (including "gdp://") and returns the
115
// name of the directory log in "topdir" and everything else
116
// in "remaining". As a side effect, performs sanitization of
117
// the path (duplicated '//', etc).
118
// e.g. "gdp://x/y/z" => "topdir" = "x", "remaining" = "y/z"
119
// Returns error when the path can not be parsed as a gdp path.
120
GdpfsStatus ParsePath(const std::string& fname,
121
                                        std::string* topdir,
122
                                        std::string* remaining);
123

    
124
// Splits `fname` into `dirname` and `basename` (the last part
// of a name). The input name may or may not have the protocol
// (gdp://) included. However, dirname never includes this
// protocol name.
// e.g., "gdp://x/y/z" => dirname = "x/y", basename = "z".
void BaseDirName(const std::string& fname, std::string* dirname,
                                                std::string* basename);
131

    
132
// Returns whether the path is a valid file/directory name.
// At the moment, simply checks for presence of a '/' in the name.
bool NameValid(const std::string& path);
135

    
136
// Create a new log and return the full name in logname, open log in *ginp.
137
GdpfsStatus CreateLog(std::string *logname, gdp_gin_t **ginp = NULL);
138

    
139
// Parsing utility function. Given a string s, parse this as
140
// an GDPfsMsg set with GDPfsFchunk. Returns offset,
141
// length and actual data (as a string).
142
GdpfsStatus ParseMsgFchunk(const std::string& s, size_t *offset,
143
                                                  size_t *len, std::string *d);
144

    
145
// Initializes a root directory --- essentialy mkfs for GDPfs
146
GdpfsStatus GDPfsMkfs(const std::string& rootdir);
147

    
148

    
149
/****************************** CLASSES *******************************/
150

    
151
// A class for low level file access. This is the base class for all
152
// GDP files/directories. Defines quite a few common utility functions,
153
// not all of which are useful for every scenario.
154
//
155
// This is a bit strange since it is the base class for both files and
156
// directories.  Directories do not have their own GIN (`handle_`), so it's
157
// not clear why that field isn't in the GDPFile class instead of
158
// GDPFileLowLevel.
159
class GDPFileLowLevel
160
{
161
        public:
162

    
163
        // initialize a file/dir backed by given logname in the
164
        // specified mode. Also calls SyncLog to set up state
165
        // variables in some reasonable state.
166
        GDPFileLowLevel(const std::string& logname, gdp_iomode_t mode);
167
        ~GDPFileLowLevel();
168

    
169
        // check for freshness; updates: maxrecs_, type_, mtime_ns_
170
        void SyncLog();
171

    
172
        // Get the type (FILE/DIR). Also sync
173
        // with the server if sync==true
174
        GDPfs::FileType GetType(bool sync=false);
175

    
176
        // Get the number of records in the log.
177
        // Optionally, sync with the server.
178
        gdp_recno_t GetNumRecs(bool sync=false);
179

    
180
        // Get the last update time (in nano-seconds). Always sync.
181
        uint64_t GetMTime();
182

    
183
        protected:
184

    
185
        /********************* variables ************************/
186

    
187
        // some basic instance variables
188
        std::string logname_;        // human readable name for gin/log
189
        gdp_name_t gobname_;        // internal name for gin/log
190
        gdp_iomode_t mode_;                // mode (GDP_MODE_RO/GDP_MODE_RA/etc)
191
        gdp_gin_t* handle_;                // associated gin/log handle
192

    
193
        // some state variables based on our reading of the data
194
        GDPfs::FileType type_ = GDPfs::UNKNOWN_TYPE;        // FILE or DIR
195
        gdp_recno_t maxrecs_ = 0;                                                // num records in log
196
        uint64_t mtime_ns_ = 0;                                                        // last update time
197

    
198
        // TODO: we should be using strings in rec_cache_, instead of pointers
199
        std::map<gdp_recno_t, std::string*> rec_cache_; // cache for records
200

    
201
        /********************** methods *************************/
202

    
203
        // set the type for this file (FILE or DIR)
204
        void SetType(GDPfs::FileType type);
205

    
206
        // stores contents in string and returns the record number
207
        gdp_recno_t ReadRecord(gdp_recno_t recno, std::string *s);
208
        // returns the number of records read
209
        int32_t ReadRecordAsync(gdp_recno_t startrec, int32_t numrec,
210
                                                        std::string **s);
211
        // if async=true, returns without waiting for a response.
212
        GdpfsStatus AppendRecord(const std::string& s, bool async=false);
213

    
214
        // operating on record cache
215
        GdpfsStatus SetCache(const gdp_recno_t recno, const std::string *s);
216
        GdpfsStatus GetCache(const gdp_recno_t recno, std::string *s);
217
};
218

    
219

    
220
/**********************************************************************/
221
// For all intents and purposes, a GDPDir represents a file system
222
// rooted at a specific name
223
class GDPDir : public GDPFileLowLevel
224
{
225
        public:
226
        GDPDir(const std::string& logname, gdp_iomode_t mode);
227

    
228
        // internal state: a map of name=>logname
229
        std::map<std::string, std::string> dentries_;
230

    
231
        // create a new file+log; stores the logname. If recursive==true,
232
        // any non-existent paths on the way are created.
233
        void NewFile(const std::string name, std::string *logname,
234
                                        bool recursive=true);
235

    
236
        // create a directory. Any non-existent parent directories
237
        // are created if recursive==true
238
        void CreateDir(const std::string name, bool recursive=true);
239

    
240
        // Delete a file; only deletes/unlinks the entry (but not the log)
241
        void DeleteFile(std::string name) { DelEntry(name); }
242

    
243
        // Same as DeleteFile, but for directory.
244
        void DeleteDir(std::string name) { DelEntry(name); }
245

    
246
        // renaming files is much more easier than renaming directories.
247
        void RenameFile(const std::string& oldname,
248
                                const std::string& newname) { RenameEntry(oldname, newname); }
249

    
250
        // returns the list of children for a specific sub-directory, or
251
        // the current directory (if dirname is empty/null)
252
        void GetChildren(const std::string& dirname,
253
                                                std::vector<std::string>* children);
254

    
255
        // returns a list of matching path names based on a  minimally
256
        // implemented regex. For full details, please see the tensorflow
257
        // requirements.
258
        void GetMatchingPaths(const std::string& pattern,
259
                                                std::vector<std::string>* results);
260

    
261
        // Returns true if the name exists
262
        bool NameExists(const std::string& name);
263

    
264
        // populates a GdpStat structure
265
        void Stat(const std::string& name, GdpStat* stat);
266

    
267
        // Returns the corresponding logname for the given entry
268
        void GetEntryLogname(const std::string& name, std::string *logname);
269

    
270
        // Returns the type of the entry (FILE/DIR) for the given name
271
        GDPfs::FileType GetEntryType(const std::string& name);
272

    
273
        private:
274

    
275
        // create a brand new entry; also creates a new log in the process
276
        void NewEntry(std::string name, GDPfs::FileType t,
277
                                                std::string *logname, bool recursive);
278

    
279
        // add a pre-existing logname as a specific name.
280
        void AddEntry(std::string name, std::string logname, GDPfs::FileType t);
281

    
282
        // Deletes an entry (file/directory)
283
        void DelEntry(std::string name);
284

    
285
        // Renames one entry to another.
286
        void RenameEntry(const std::string& oldname, const std::string& newname);
287
};
288

    
289
/**********************************************************************/
290
class GDPFile : public GDPFileLowLevel
291
{
292
        public:
293
        GDPFile(const std::string& logname, gdp_iomode_t mode);
294

    
295
        // Returns the size of the file.
296
        size_t GetFileSize(bool sync=true);
297

    
298
        protected:
299
        size_t filesize_ = 0;
300

    
301
};
302

    
303
/**********************************************************************/
304
// A simple, read-only memory mapped file
305
class GDPFileROMemMap : public GDPFile
306
{
307
        public:
308
        GDPFileROMemMap(const std::string& logname, bool async=true);
309

    
310
        const char *data()
311
        {
312
                return filedata_.c_str();        // initialized in constructor
313
        }
314

    
315
        uint64_t length()
316
        {
317
                return filedata_.length();
318
        }
319

    
320
        protected:
321
        std::string filedata_;
322

    
323
        // Fetch records [startrec..endrec] (i.e. including both
324
        // extremes) and update the filedata_ internally.
325
        void FetchRecords(gdp_recno_t startrec, gdp_recno_t endrec,
326
                                           bool async=true);
327
};
328

    
329
/**********************************************************************/
330
// A read only file, based on the memory mapped file.
331
class GDPFileRO : public GDPFileROMemMap
332
{
333
        public:
334
        GDPFileRO(const std::string& logname, bool async=true)
335
                                        : GDPFileROMemMap(logname, async) {}
336

    
337
        // Reads at most 'len' bytes at 'offset', stores them in 'buf'.
338
        // Returns the actual number of bytes returned.
339
        size_t Read(size_t offset, size_t len, char *buf);
340
};
341

    
342
/**********************************************************************/
343
// an append only file; the name is a misnomer
344
class GDPFileWO : public GDPFile
345
{
346
        public:
347
        GDPFileWO(const std::string& logname)
348
                                        : GDPFile(logname, GDP_MODE_AO) {}
349

    
350
        // Append an (arbitrarily long) string, by potentially splitting
351
        // it across a number of records.
352
        void Append(const std::string& s, bool async=true);
353
};
354

    
355
#endif